diff --git a/.gitignore b/.gitignore index 7d4937541..b8f113f92 100644 --- a/.gitignore +++ b/.gitignore @@ -199,6 +199,7 @@ test_quad_irreg_interp test_quad_reg_interp test_table_read test_ran_unif +test_kde_dist # Directories to NOT IGNORE ... same as executable names # as far as I know, these must be listed after the executables diff --git a/CHANGELOG.rst b/CHANGELOG.rst index db39b1910..b0ed5bc38 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -22,6 +22,14 @@ individual files. The changes are now listed with the most recent at the top. +**August 26 2024 :: KQCEF. Tag 11.7.0** + +- Adds a Quantile-Conserving Ensemble Filter Based on Kernel-Density Estimation to DART. +- New distribution module kde_distribution_mod. +- New module rootfinding_mod that provides a different implementation of inv_cdf. + +*Contributed by Ian Grooms* + **August 15 2024 :: WRF fwd operator bug fixes. Tag v11.6.1** WRF-DART bug-fixes: diff --git a/assimilation_code/modules/assimilation/algorithm_info_mod.f90 b/assimilation_code/modules/assimilation/algorithm_info_mod.f90 index 42d5f0c21..41887d4ca 100644 --- a/assimilation_code/modules/assimilation/algorithm_info_mod.f90 +++ b/assimilation_code/modules/assimilation/algorithm_info_mod.f90 @@ -22,7 +22,7 @@ module algorithm_info_mod use distribution_params_mod, only : NORMAL_DISTRIBUTION, BOUNDED_NORMAL_RH_DISTRIBUTION, & GAMMA_DISTRIBUTION, BETA_DISTRIBUTION, LOG_NORMAL_DISTRIBUTION, UNIFORM_DISTRIBUTION, & - PARTICLE_FILTER_DISTRIBUTION + PARTICLE_FILTER_DISTRIBUTION, KDE_DISTRIBUTION implicit none private @@ -40,12 +40,13 @@ module algorithm_info_mod integer, parameter :: OBS_PARTICLE = 4 integer, parameter :: UNBOUNDED_RHF = 8 integer, parameter :: GAMMA_FILTER = 11 -integer, parameter :: BOUNDED_NORMAL_RHF = 101 +integer, parameter :: BOUNDED_NORMAL_RHF = 101 +integer, parameter :: KDE_FILTER = 102 public :: obs_error_info, probit_dist_info, obs_inc_info, & init_algorithm_info_mod, end_algorithm_info_mod, & EAKF, ENKF, BOUNDED_NORMAL_RHF, UNBOUNDED_RHF, & - GAMMA_FILTER, KERNEL, OBS_PARTICLE + GAMMA_FILTER, KERNEL, OBS_PARTICLE, KDE_FILTER ! type definitions for the QCF table type obs_error_info_type @@ -223,6 +224,8 @@ subroutine read_qceff_table(qceff_table_filename) qceff_table_data(row)%probit_inflation%dist_type = LOG_NORMAL_DISTRIBUTION case ('UNIFORM_DISTRIBUTION') qceff_table_data(row)%probit_inflation%dist_type = UNIFORM_DISTRIBUTION + case ('KDE_DISTRIBUTION') + qceff_table_data(row)%probit_inflation%dist_type = KDE_DISTRIBUTION !!!case ('PARTICLE_FILTER_DISTRIBUTION') !!!qceff_table_data(row)%probit_inflation%dist_type = PARTICLE_FILTER_DISTRIBUTION case default @@ -246,6 +249,8 @@ subroutine read_qceff_table(qceff_table_filename) qceff_table_data(row)%probit_state%dist_type = LOG_NORMAL_DISTRIBUTION case ('UNIFORM_DISTRIBUTION') qceff_table_data(row)%probit_state%dist_type = UNIFORM_DISTRIBUTION + case ('KDE_DISTRIBUTION') + qceff_table_data(row)%probit_state%dist_type = KDE_DISTRIBUTION !!!case ('PARTICLE_FILTER_DISTRIBUTION') !!!qceff_table_data(row)%probit_state%dist_type = PARTICLE_FILTER_DISTRIBUTION case default @@ -268,6 +273,8 @@ subroutine read_qceff_table(qceff_table_filename) qceff_table_data(row)%probit_extended_state%dist_type = LOG_NORMAL_DISTRIBUTION case ('UNIFORM_DISTRIBUTION') qceff_table_data(row)%probit_extended_state%dist_type = UNIFORM_DISTRIBUTION + case ('KDE_DISTRIBUTION') + qceff_table_data(row)%probit_extended_state%dist_type = KDE_DISTRIBUTION !!!case ('PARTICLE_FILTER_DISTRIBUTION') !!!qceff_table_data(row)%probit_extended_state%dist_type = PARTICLE_FILTER_DISTRIBUTION case default @@ -290,6 +297,8 @@ subroutine read_qceff_table(qceff_table_filename) qceff_table_data(row)%obs_inc_info%filter_kind = GAMMA_FILTER case ('BOUNDED_NORMAL_RHF') qceff_table_data(row)%obs_inc_info%filter_kind = BOUNDED_NORMAL_RHF + case ('KDE_FILTER') + qceff_table_data(row)%obs_inc_info%filter_kind = KDE_FILTER case default write(errstring, *) 'Invalid filter kind: ', trim(filter_kind_string(row)) call error_handler(E_ERR, 'read_qceff_table:', errstring, source) diff --git a/assimilation_code/modules/assimilation/assim_tools_mod.f90 b/assimilation_code/modules/assimilation/assim_tools_mod.f90 index d1b6e2bbf..5787c0beb 100644 --- a/assimilation_code/modules/assimilation/assim_tools_mod.f90 +++ b/assimilation_code/modules/assimilation/assim_tools_mod.f90 @@ -78,13 +78,17 @@ module assim_tools_mod use algorithm_info_mod, only : probit_dist_info, obs_inc_info, EAKF, ENKF, & BOUNDED_NORMAL_RHF, UNBOUNDED_RHF, GAMMA_FILTER, & - KERNEL, OBS_PARTICLE + KERNEL, OBS_PARTICLE, KDE_FILTER use gamma_distribution_mod, only : gamma_cdf, inv_gamma_cdf, gamma_mn_var_to_shape_scale, & gamma_gamma_prod use bnrh_distribution_mod, only : inv_bnrh_cdf, bnrh_cdf, inv_bnrh_cdf_like +use kde_distribution_mod, only : kde_cdf_params, inv_kde_cdf_params, obs_dist_types, & + pack_kde_params, likelihood_function, separate_ensemble, & + obs_increment_kde + use distribution_params_mod, only : distribution_params_type, deallocate_distribution_params @@ -1025,6 +1029,10 @@ subroutine obs_increment(ens_in, ens_size, obs, obs_var, obs_kind, obs_inc, & !!!endif !-------------------------------------------------------------------------- + + else if(filter_kind == KDE_FILTER) then + call obs_increment_kde(ens, ens_size, obs, obs_var, bounded_below, & + bounded_above, lower_bound, upper_bound, obs_inc) else call error_handler(E_ERR,'obs_increment', & 'Illegal value of filter_kind', source) @@ -1166,9 +1174,6 @@ subroutine obs_increment_bounded_norm_rhf(ens, ens_like, ens_size, prior_var, & end subroutine obs_increment_bounded_norm_rhf - - - ! Computes a normal or truncated normal (above and/or below) likelihood. function get_truncated_normal_like(x, obs, obs_var, & bounded_below, bounded_above, lower_bound, upper_bound) @@ -1420,6 +1425,8 @@ subroutine obs_increment_kernel(ens, ens_size, obs, obs_var, obs_inc) endif ! Generate a uniform random number and a Gaussian for each new member +!! IG: This could be done with a single random number and in a way that +!! preserves the order of the ensemble members do i = 1, ens_size unif = random_uniform(inc_ran_seq) ! Figure out which kernel it's in diff --git a/assimilation_code/modules/assimilation/distribution_params_mod.f90 b/assimilation_code/modules/assimilation/distribution_params_mod.f90 index 7951a7c7b..050781078 100644 --- a/assimilation_code/modules/assimilation/distribution_params_mod.f90 +++ b/assimilation_code/modules/assimilation/distribution_params_mod.f90 @@ -25,10 +25,12 @@ module distribution_params_mod integer, parameter :: LOG_NORMAL_DISTRIBUTION = 5 integer, parameter :: UNIFORM_DISTRIBUTION = 6 integer, parameter :: PARTICLE_FILTER_DISTRIBUTION = 7 +integer, parameter :: KDE_DISTRIBUTION = 8 public :: distribution_params_type, deallocate_distribution_params, & NORMAL_DISTRIBUTION, BOUNDED_NORMAL_RH_DISTRIBUTION, GAMMA_DISTRIBUTION, BETA_DISTRIBUTION, & - LOG_NORMAL_DISTRIBUTION, UNIFORM_DISTRIBUTION, PARTICLE_FILTER_DISTRIBUTION + LOG_NORMAL_DISTRIBUTION, UNIFORM_DISTRIBUTION, PARTICLE_FILTER_DISTRIBUTION, & + KDE_DISTRIBUTION contains diff --git a/assimilation_code/modules/assimilation/kde_distribution_mod.f90 b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 new file mode 100644 index 000000000..3dee62db7 --- /dev/null +++ b/assimilation_code/modules/assimilation/kde_distribution_mod.f90 @@ -0,0 +1,1247 @@ +! DART software - Copyright UCAR. This open source software is provided +! by UCAR, "as is", without charge, subject to all terms of use at +! http://www.image.ucar.edu/DAReS/DART/DART_download + +! This module originally written by I. Grooms and described (briefly) +! in "A Quantile-Conserving Ensemble Filter Based on Kernel-Density Estimation" +! by Grooms & Riedel (Remote Sensing, 2024; DOI:10.3390/rs16132377). + +module kde_distribution_mod + +use types_mod, only : r8, MISSING_R8 + +use utilities_mod, only : E_ERR, E_MSG, error_handler + +use sort_mod, only : sort + +use random_seq_mod, only : random_seq_type, random_gaussian, init_random_seq, & + random_uniform + +use mpi_utilities_mod, only : my_task_id + +use distribution_params_mod, only : distribution_params_type, deallocate_distribution_params, & + KDE_DISTRIBUTION + +use rootfinding_mod, only : inv_cdf + +use normal_distribution_mod, only : normal_cdf + +implicit none +private + +public :: kde_cdf, kde_cdf_params, inv_kde_cdf, inv_kde_cdf_params, & + test_kde, obs_dist_types, likelihood_function, pack_kde_params, & + separate_ensemble, obs_increment_kde + +type available_obs_dist_types + integer :: uninformative, normal, binomial, gamma, & + inv_gamma, lognormal, truncated_normal +end type + +type(available_obs_dist_types), parameter :: obs_dist_types = & +available_obs_dist_types(uninformative=999, normal=998, binomial=997, & + gamma=996, inv_gamma=995, lognormal=994, truncated_normal=993) +character(len=512) :: errstring +character(len=*), parameter :: source = 'kde_distribution_mod.f90' + +! True if random sequence needs to be initialized +logical :: first_inc_ran_call = .true. +type (random_seq_type) :: inc_ran_seq + +contains + +!--------------------------------------------------------------------------- + +function likelihood_function(x, y, obs_param, obs_dist_type, & + bounded_above, bounded_below, upper_bound, lower_bound) result(l) + real(r8) :: l ! likelihood value + real(r8), intent(in) :: x ! state value + real(r8), intent(in) :: y ! obs value + real(r8), intent(in) :: obs_param ! meaning depends on obs_dist_type + integer, intent(in) :: obs_dist_type ! obs distribution type + logical, optional, intent(in) :: bounded_above, bounded_below + real(r8), optional, intent(in) :: upper_bound, lower_bound + + ! Evaluates the likelihood pdf(y | x) for various kinds of observation + ! distributions. The value returned is not equal to the observation + ! pdf evaluated at y, because normalization constants that don't depend + ! on x are omitted. + + real(r8) :: gamma_shape, gamma_scale + real(r8) :: inv_gamma_shape, inv_gamma_scale + real(r8) :: cdf(2) + logical :: bounded_above_val, bounded_below_val + + l = 1._r8 ! Initialize + + select case (obs_dist_type) + case (obs_dist_types%uninformative) + ! Uninformative observations have a likelihood equal to one. + l = 1._r8 + case (obs_dist_types%normal) + ! For a normal obs distribution, like_param is the obs error variance + if (obs_param <= 0._r8) then + write(errstring, *) 'obs error sd <= 0', obs_param + call error_handler(E_ERR, 'kde_distribution_mod:likelihood', errstring, source) + else + l = exp( -0.5_r8 * (x - y)**2 / obs_param ) + end if + case (obs_dist_types%binomial) + ! For a binomial obs distribution 0<=x<=1 is the probability of + ! observing y successes of obs_param total trials + if (y < 0._r8) then + write(errstring, *) 'y value is negative with a binomial obs model ', y + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (y > obs_param) then + write(errstring, *) 'successes greater than total trials with a binomial obs model ', y, obs_param + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif ((x < 0._r8) .or. (x > 1._r8)) then + write(errstring, *) 'x outside [0,1] with a binomial obs model ', x + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + else + l = (x**y) * (1._r8 - x)**(obs_param - y) + end if + case (obs_dist_types%gamma) + ! For a gamma obs distribution, the mean is x and the variance is obs_param * x^2, i.e. + ! the obs error sd is sqrt(obs_param) times the true value. If the error sd is p% of x, + ! set obs_param = (p/100._r8)**2. + ! For a gamma obs distribution, the likelihood is inverse gamma. + if (x < 0._r8) then + write(errstring, *) 'x value is negative with a gamma obs model ', x + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (y <= 0._r8) then + write(errstring, *) 'y value is non-positive with a gamma obs model ', y + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (obs_param <= 0._r8) then + write(errstring, *) 'obs variance is non-positive with a gamma obs model ', obs_param + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + else + gamma_shape = 1._r8 / obs_param ! k + gamma_scale = x * obs_param ! theta + if (x == 0._r8) then + l = 0._r8 ! Technically x must be > 0, but just setting l=0 is convenient. + else + l = (y / gamma_scale)**gamma_shape * exp(-y / gamma_scale) + end if + end if + case (obs_dist_types%inv_gamma) + ! For an inverse gamma obs distribution, the mean is x and the variance is obs_param * x^2, + ! i.e. the obs error sd is sqrt(obs_param) times the true value. If the error sd is p% of x, + ! set obs_param = (p/100._r8)**2. + ! For an inverse gamma obs distribution, the likelihood is gamma. + if (x < 0._r8) then + write(errstring, *) 'x value is negative with an inverse gamma obs model ', x + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (y <= 0._r8) then + write(errstring, *) 'y value is non-positive with an inverse gamma obs model ', y + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (obs_param <= 0._r8) then + write(errstring, *) 'obs variance is non-positive with an inverse gamma obs model ', obs_param + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + else + inv_gamma_shape = (1._r8 / obs_param) + 2._r8 ! alpha + inv_gamma_scale = x * (inv_gamma_shape - 1._r8) ! beta + if (x .eq. 0._r8) then + l = 0._r8 ! Technically x must be > 0, but just setting l=0 is convenient. + else + l = (inv_gamma_scale**inv_gamma_shape) * exp( - inv_gamma_scale / y ) + end if + end if + case (obs_dist_types%lognormal) + ! For a lognormal obs distribution, ln(y) is normal with mean x and variance obs_param. + ! The likelihood is normal. + if (y <= 0._r8) then + write(errstring, *) 'y value is non-positive with a lognormal obs model', y + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + elseif (obs_param <= 0._r8) then + write(errstring, *) 'obs error sd <= 0', obs_param + call error_handler(E_MSG, 'kde_distribution_mod:likelihood', errstring, source) + else + l = exp( -0.5_r8 * (x - log(y))**2 / obs_param ) + end if + case (obs_dist_types%truncated_normal) + ! Code based on the version in assim_tools_mod.f90 + ! This implicitly assumes that the bounds on the y distribution are the same + ! as the bounds on x, which makes sense in observation space. The factor of + ! sigma * sqrt(2 * pi) is ignored. + ! A zero observation error variance is a degenerate case + if(obs_param <= 0.0_r8) then + write(errstring, *) 'obs error sd <= 0', obs_param + call error_handler(E_ERR, 'kde_distribution_mod:likelihood', errstring, source) + endif + + cdf = [0._r8, 1._r8] + if (present(bounded_above)) then + bounded_above_val = bounded_above + else + bounded_above_val = .false. + end if + + if (present(bounded_below)) then + bounded_below_val = bounded_below + else + bounded_below_val = .false. + end if + + if (bounded_below_val) cdf(1) = normal_cdf(lower_bound, x, sqrt(obs_param)) + if (bounded_above_val) cdf(2) = normal_cdf(upper_bound, x, sqrt(obs_param)) + + l = exp( -0.5_r8 * (x - y)**2 / obs_param ) / (cdf(2) - cdf(1)) + case DEFAULT + write(errstring, *) 'likelihood called with unrecognized obs_dist_type ', obs_dist_type + call error_handler(E_ERR, 'kde_distribution_mod:likelihood_function', errstring, source) + end select + +end function likelihood_function + +!--------------------------------------------------------------------------- + +elemental function epanechnikov_kernel(x) + real(r8) :: epanechnikov_kernel + real(r8), intent(in) :: x + real(r8), parameter :: norm_const = 3._r8 / 4._r8 + epanechnikov_kernel = norm_const * max(0._r8, 1._r8 - x**2) + +end function epanechnikov_kernel + +!--------------------------------------------------------------------------- + +elemental function epanechnikov_cdf(x) + real(r8) :: epanechnikov_cdf + real(r8), intent(in) :: x + real(r8), parameter :: norm_const = 1._r8 / 4._r8 + if (x <= -1._r8) then + epanechnikov_cdf = 0._r8 + elseif (x <= 1._r8) then + epanechnikov_cdf = min(1._r8, max(0._r8, norm_const * (2._r8 + 3._r8 * x - x**3))) + else + epanechnikov_cdf = 1._r8 + end if + +end function epanechnikov_cdf + +!--------------------------------------------------------------------------- + +elemental subroutine boundary_correction(x, lx, mx) + real(r8), intent(in) :: x + real(r8), intent(out) :: lx, mx + + ! Boundary correction for kde, from Jones (1993) and Jones & Foster (1996) + + real(r8) :: denom + + denom = ((1._r8 + x)**4) * (19._r8 + 3._r8 * x * (x - 6._r8)) + lx =-64._r8 * (-2._r8 + x * (4._r8 + 3._r8 * x * (x - 2._r8))) / denom + mx = 240._r8 * (x - 1._r8)**2 / denom + +end subroutine boundary_correction + +!--------------------------------------------------------------------------- + +function kde_pdf(x, p) + real(r8) :: kde_pdf + real(r8), intent(in) :: x + type(distribution_params_type), intent(in) :: p + + ! Returns the kernel estimate of the pdf evaluated at x. + + integer :: i + real(r8) :: u_lower, lx_lower, mx_lower + real(r8) :: u_upper, lx_upper, mx_upper + + kde_pdf = 0._r8 ! Initialize + if (p%bounded_below .and. (x <= p%lower_bound)) return + if (p%bounded_above .and. (x >= p%upper_bound)) return + do i=1, p%ens_size ! Reduction loop - parallelizable + u_lower = 0._r8; lx_lower = 1._r8 ; mx_lower = 0._r8 + u_upper = 0._r8; lx_upper = 1._r8 ; mx_upper = 0._r8 + if (p%bounded_below) then ! Bounded below + u_lower = min( 1._r8, max( 0._r8, (x - p%lower_bound) / p%more_params(i) ) ) ! p%more_params(i) holds kernel width for ensemble member i + call boundary_correction(u_lower, lx_lower, mx_lower) + end if + if (p%bounded_above) then ! Bounded above + u_upper = min( 1._r8, max( 0._r8, (p%upper_bound - x) / p%more_params(i) ) ) + call boundary_correction(u_upper, lx_upper, mx_upper) + end if + u_lower = (x - p%ens(i)) / p%more_params(i) ! Not u_lower any more, just (x-x_i)/h_i + kde_pdf = kde_pdf + (1._r8 / p%more_params(i)) * & + (lx_lower + mx_lower * u_lower) * & + (lx_upper - mx_upper * u_lower) * & + epanechnikov_kernel( u_lower ) + end do + kde_pdf = max(0._r8, kde_pdf) / (p%ens_size * p%more_params(p%ens_size + 1)) ! p%more_params(ens_size + 1) normalizes the pdf + +end function kde_pdf + +!----------------------------------------------------------------------- + +subroutine get_kde_bandwidths(ens_size, ens, bandwidths) + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + real(r8), intent(out) :: bandwidths(ens_size) + + real(r8) :: ens_mean, ens_sd, h0, g, d_max + real(r8), dimension(ens_size) :: f_tilde, d, lambda + integer :: i, j, k + + + d(:) = abs( ens(:) - ens(1) ) + d_max = maxval(d(:)) + if (d_max <= 0._r8) then + errstring = 'Bandwidth = 0 ' + call error_handler(E_ERR, 'get_kde_bandwidths', errstring, source) + end if + ens_mean = sum(ens) / ens_size + ens_sd = sqrt( sum( (ens - ens_mean)**2 ) / (ens_size - 1._r8) ) + h0 = 2._r8 * ens_sd / (ens_size**0.2_r8) ! This would be the kernel width if the widths were not adaptive. + ! It would be better to use min(sd, iqr/1.34) but don't want to compute iqr + k = floor( sqrt( real(ens_size) ) ) ! distance to kth nearest neighbor used to set bandwidth + do i=1,ens_size + d(:) = sort( abs( ens(:) - ens(i) ) ) + d_max = maxval(d(:)) + where (d < 1.e-3_r8 * d_max) ! set small distances to zero + d = 0._r8 + end where + j = 1 + do while ((d(k+j) <= 0._r8) .and. (k+j < ens_size)) + j = j + 1 + end do + f_tilde(i) = 0.5_r8 * real(k+j-1, r8) / (real(ens_size, r8) * d(k+j)) ! Initial density estimate + end do + f_tilde(:) = f_tilde(:) / maxval(f_tilde(:)) ! Avoids overflow in the next line + g = product( f_tilde )**( 1._r8 / real(ens_size, r8) ) + lambda(:) = sqrt( g / f_tilde(:) ) + bandwidths(:) = h0 * lambda(:) + if (maxval(bandwidths(:)) <= 0._r8) then + write(*,*) 'Bandwidth breakdown' + write(*,*) h0, g, lambda(:), f_tilde(:), ens(:) + end if + +end subroutine get_kde_bandwidths + +!--------------------------------------------------------------------------- + +function gq5(left, right, p) result(q) + real(r8) :: q + real(r8), intent(in) :: left, right + type(distribution_params_type), intent(in) :: p + + ! Uses 5-point Gauss quadrature to approximate \int_left^right l(s; y) p(s) ds + ! where p(x) is the prior pdf and l(x; y) is the likelihood. This only works + ! correctly if left >= lower_bound and right <= upper_bound. The result is + ! **Not Normalized**. + + real(r8) :: y + real(r8) :: obs_param ! See likelihood function for interpretation + integer :: obs_dist_type ! See likelihood function for interpretation + real(r8) :: xi ! quadrature point + real(r8), save :: chi(5) = [-sqrt(5._r8 + 2._r8 * sqrt(10._r8 / 7._r8)) / 3._r8, & + -sqrt(5._r8 - 2._r8 * sqrt(10._r8 / 7._r8)) / 3._r8, & + 0._r8, & + sqrt(5._r8 - 2._r8 * sqrt(10._r8 / 7._r8)) / 3._r8, & + sqrt(5._r8 + 2._r8 * sqrt(10._r8 / 7._r8)) / 3._r8] ! Gauss quadrature points + real(r8), save :: w(5) = [(322._r8 - 13._r8 * sqrt(70._r8)) / 900._r8, & + (322._r8 + 13._r8 * sqrt(70._r8)) / 900._r8, & + 128._r8/225._r8, & + (322._r8 + 13._r8 * sqrt(70._r8)) / 900._r8, & + (322._r8 - 13._r8 * sqrt(70._r8)) / 900._r8] ! GQ weights + real(r8) :: l ! value of the likelihood function + integer :: k + + ! Unpack obs info from param struct + y = p%more_params(p%ens_size + 2) + obs_param = p%more_params(p%ens_size + 3) + obs_dist_type = nint(p%more_params(p%ens_size + 4)) + + q = 0._r8 + do k=1,5 + xi = 0.5_r8 * ((right - left) * chi(k) + left + right) + if (obs_dist_type .eq. obs_dist_types%truncated_normal) then + l = likelihood_function(xi, y, obs_param, obs_dist_type, & + bounded_above=p%bounded_above, bounded_below=p%bounded_below, & + upper_bound=p%upper_bound, lower_bound=p%lower_bound) + else + l = likelihood_function(xi, y, obs_param, obs_dist_type) + end if + q = q + 0.5_r8 * (right - left) * w(k) * kde_pdf(xi, p) * l + end do + +end function gq5 + +!--------------------------------------------------------------------------- + +function integrate_pdf(x, p) result(q) + real(r8) :: q + real(r8), intent(in) :: x + type(distribution_params_type), intent(in) :: p + + ! Uses quadrature to approximate \int_{-\infty}^x l(x; y) p(s) ds where + ! p(s) is the prior pdf and l(x; y) is the likelihood. The interval is + ! broken up into sub-intervals whose boundaries are either one of the bounds, + ! or the edge of support of one of the kernels, or the value of x. On each + ! sub-interval the integral is approximated using Gauss-Legendre quadrature + ! with 5 points. When the likelihood is flat and the boundaries are far + ! from the ensemble, the result is exact up to roundoff error. + + real(r8) :: y + real(r8) :: obs_param ! See likelihood function for interpretation + integer :: obs_dist_type ! See likelihood function for interpretation + real(r8) :: edges(2*p%ens_size) + real(r8) :: left, right ! edges of current sub-interval, quadrature point + integer :: i + + ! Unpack obs info from param struct + y = p%more_params(p%ens_size + 2) + obs_param = p%more_params(p%ens_size + 3) + obs_dist_type = nint(p%more_params(p%ens_size + 4)) + + edges(1:p%ens_size) = p%ens(1:p%ens_size) - p%more_params(1:p%ens_size) + edges(p%ens_size + 1:2*p%ens_size) = p%ens(1:p%ens_size) + p%more_params(1:p%ens_size) + edges(:) = sort(edges(:)) ! If bandwidths were constant we would not need to sort + + ! If x is outside the support of the pdf then we can skip the quadrature. + left = edges(1) + if (p%bounded_below) left = max(left, p%lower_bound) + if (x <= left) then + q = 0._r8 + return + end if + + ! Important to use x > upper_bound here because I use + ! x = upper_bound to compute the normalization constant. + if ((p%bounded_above) .and. (x > p%upper_bound)) then + q = 1._r8 + return + end if + + ! If we haven't returned yet, then there is at least one subinterval. + i = 1 + right = min(x, edges(2)) ! left was computed above + q = gq5(left, right, p) + do while ((x > right) .and. (i+1 < 2*p%ens_size)) + i = i + 1 + left = right + right = min(x, edges(i+1)) + q = q + gq5(left, right, p) + end do + ! Note that it is possible to have maxval(edges) < x < upper_bound, + ! but that last sub-interval from maxval(edges) to x has zero integral, + ! so it can be safely skipped. + +end function integrate_pdf + +!----------------------------------------------------------------------- + +subroutine pack_kde_params(ens_size, bounded_below, bounded_above, lower_bound, upper_bound, & + ens, y, obs_param, obs_dist_type, p) + integer, intent(in) :: ens_size + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(in) :: ens(ens_size) + real(r8), intent(in) :: y + real(r8), intent(in) :: obs_param + integer, intent(in) :: obs_dist_type + type(distribution_params_type), intent(out) :: p + + real(r8) :: bandwidths(ens_size) + real(r8) :: edge, edges(2*ens_size + 2) + integer :: i + + ! Set the fixed storage parameters in the distribution_params_type + p%ens_size = ens_size + p%distribution_type = KDE_DISTRIBUTION + + ! Allocate space needed for the parameters + allocate(p%ens(1:ens_size), source=0._r8) + allocate(p%more_params(1:5*ens_size + 8), source=0._r8) + + ! p%more_params(1:ens_size) are the kernel bandwidths + ! p%more_params(ens_size + 1) is the normalization constant for the pdf + ! p%more_params(ens_size + 2) is the observation value y; not used for prior + ! p%more_params(ens_size + 3) is the observation parameter (see likelihood for details); not used for prior + ! p%more_params(ens_size + 4) is the observation error distribution type. For prior set to uninformative + ! p%more_params(ens_size + 5:3*ens_size + 6) are the edges. + ! p%more_params(3*ens_size+7:5*ens_size+8) are the cdf evaluated at the edges. + + ! Save the ensemble values, sorted now so that they don't have to be re-sorted for the first guess + p%ens(:) = sort(ens(:)) + + ! Store the kernel bandwidths in the more_params array + ! Important to make sure that p%ens and p%more_params are sorted in same order by passing p%ens rather than ens + call get_kde_bandwidths(ens_size, p%ens, bandwidths) + p%more_params(1:ens_size) = bandwidths(:) + + ! Pack obs information + p%more_params(ens_size + 2) = y + p%more_params(ens_size + 3) = obs_param + p%more_params(ens_size + 4) = real(obs_dist_type, r8) ! This is not ideal because it involves a type conversion from int to real + + ! Get the edges of the subintervals on which the pdf is smooth + edges(1:ens_size) = p%ens(:) - bandwidths(:) + edges(ens_size+1:2*ens_size) = p%ens(:) + bandwidths(:) + if (bounded_below) then + edges(2*ens_size+1) = lower_bound + else + edges(2*ens_size+1) = minval(edges(1:ens_size)) + end if + if (bounded_above) then + edges(2*ens_size+2) = upper_bound + else + edges(2*ens_size+2) = maxval(edges(ens_size+1:2*ens_size)) + end if + edges(:) = sort(edges(:)) + p%more_params(ens_size+5:3*ens_size+6) = edges(:) + + ! If the ensemble is sufficiently far from the boundary, that we can use the unbounded code, which is cheaper. + edge = minval(p%ens(:) - 2*bandwidths(:)) + if (bounded_below .and. (edge > lower_bound)) then + p%bounded_below = .false. + p%lower_bound = minval(p%ens(:) - bandwidths(:)) + else + p%bounded_below = bounded_below + p%lower_bound = lower_bound + end if + + ! If the ensemble is sufficiently far from the boundary, that we can use the unbounded code, which is cheaper. + edge = maxval(p%ens(:) + 2*bandwidths(:)) + if (bounded_above .and. (edge < upper_bound)) then + p%bounded_above = .false. + p%upper_bound = maxval(p%ens(:) + bandwidths(:)) + else + p%bounded_above = bounded_above + p%upper_bound = upper_bound + end if + + ! Integrate across all subintervals + p%more_params(ens_size+1) = 1._r8 ! Temporary value + p%more_params(3*ens_size+7) = 0._r8 ! 0 at left edge + do i=2,2*ens_size+2 + if ((p%bounded_below .and. (edges(i) <= p%lower_bound)) .or. & + (edges(i) <= edges(i-1)) .or. & + (p%bounded_above .and. (edges(i) > p%upper_bound))) then + ! The subinterval [edges(i-1), edges(i)] is empty or outside the bounds + p%more_params(3*ens_size+6+i) = p%more_params(3*ens_size+6+i-1) + 0._r8 + else + p%more_params(3*ens_size+6+i) = p%more_params(3*ens_size+6+i-1) + gq5(edges(i-1), edges(i), p) + end if + end do + p%more_params(ens_size+1) = maxval(p%more_params(3*ens_size+7:5*ens_size+8)) + p%more_params(3*ens_size+7:5*ens_size+8) = p%more_params(3*ens_size+7:5*ens_size+8) / p%more_params(ens_size+1) + +end subroutine pack_kde_params + +!----------------------------------------------------------------------- + +function kde_cdf_params(x, p) result(quantile) + real(r8), intent(in) :: x + type(distribution_params_type), intent(in) :: p + real(r8) :: quantile + + ! Returns the cumulative distribution of the kernel density estimate + ! at the value x. + + integer :: i + logical :: use_analytical_cdf + ! It is a waste of memory to unpack p%more_params, but it aids readability + real(r8) :: bandwidths(p%ens_size) + real(r8) :: y, obs_param + integer :: obs_dist_type + real(r8), dimension(2*p%ens_size+2) :: edges, cdfs + + bandwidths(:) = p%more_params(1:p%ens_size) + y = p%more_params(p%ens_size + 2) + obs_param = p%more_params(p%ens_size + 3) + obs_dist_type = nint(p%more_params(p%ens_size + 4)) + edges(:) = p%more_params( p%ens_size+5:3*p%ens_size+6) + cdfs(:) = p%more_params(3*p%ens_size+7:5*p%ens_size+8) + + quantile = 0._r8 + ! If the likelihood is uninformative and the distribution is unbounded, we can evaluate + ! the cdf analytically (instead of using quadrature) to save computation. + use_analytical_cdf = ( (.not.p%bounded_below) .and. (.not.p%bounded_above) .and. & + (obs_dist_type .eq. obs_dist_types%uninformative) ) + if (use_analytical_cdf) then + do i=1,p%ens_size + quantile = quantile + epanechnikov_cdf( (x - p%ens(i)) / bandwidths(i) ) / p%ens_size + end do + else ! Compute cdf using quadrature + !quantile = min(1._r8, max(0._r8, integrate_pdf(x, p))) + if (x < edges(1)) then + return + end if + do i=2,2*p%ens_size+2 + if ((edges(i-1) <= x) .and. (x <= edges(i))) then + quantile = min(1._r8, max(0._r8, cdfs(i-1) + gq5(edges(i-1), x, p))) + return + else + cycle + end if + end do + quantile = 1._r8 + end if + +end function kde_cdf_params + +!--------------------------------------------------------------------------- + +function kde_cdf(x, ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, y, obs_param, obs_dist_type) result(quantile) + real(r8), intent(in) :: x + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(in) :: y + real(r8), intent(in) :: obs_param + integer, intent(in) :: obs_dist_type + real(r8) :: quantile + + ! Returns the cumulative distribution of the kernel density estimate + ! at the value x. + + type(distribution_params_type) :: p + + call pack_kde_params(ens_size, bounded_below, bounded_above, lower_bound, upper_bound, & + ens, y, obs_param, obs_dist_type, p) + quantile = kde_cdf_params(x,p) + +end function kde_cdf + +!--------------------------------------------------------------------------- + +function inv_kde_cdf(quantile, ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, y, obs_param, obs_dist_type) result(x) + real(r8), intent(in) :: quantile + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(in) :: y + real(r8), intent(in) :: obs_param + integer, intent(in) :: obs_dist_type + real(r8) :: x + + ! Returns the value x such that cdf(x) = quantile. + + type(distribution_params_type) :: p + + call pack_kde_params(ens_size, bounded_below, bounded_above, lower_bound, upper_bound, & + ens, y, obs_param, obs_dist_type, p) + + x = inv_cdf(quantile, kde_cdf_params, inv_kde_first_guess_params, p) + +end function inv_kde_cdf + +!--------------------------------------------------------------------------- + +function inv_kde_cdf_params(quantile, p) result(x) + real(r8) :: x + real(r8), intent(in) :: quantile + type(distribution_params_type), intent(in) :: p + + ! Returns the value x such that cdf(x) = quantile. + + x = inv_cdf(quantile, kde_cdf_params, inv_kde_first_guess_params, p) + +end function inv_kde_cdf_params + +!--------------------------------------------------------------------------- + +function inv_kde_first_guess_params(quantile, p) result(x) + real(r8) :: x + real(r8), intent(in) :: quantile + type(distribution_params_type), intent(in) :: p + real(r8) :: edge ! edge of support of the cdf + + ! This first-guess subroutine evaluates the cdf at the ensemble members, + ! then finds a pair of ensemble members whose quantiles bracket the + ! target quantile, then sets the first guess to a convex combination of + ! these two ensemble members. If the target quantile is outside the + ! ensemble, the first guess is a combination of the nearest ensemble + ! member and the edge of the support of the pdf. If the + ! target quantile is 0 or 1, the appropriate bound is returned. + + integer :: i + real(r8) :: dq + real(r8), dimension(2*p%ens_size+2) :: edges, cdfs + + edges(:) = p%more_params( p%ens_size+5:3*p%ens_size+6) + cdfs(:) = p%more_params(3*p%ens_size+7:5*p%ens_size+8) + + if (quantile <= 0._r8) then + x = minval(edges(:)) + if (p%bounded_below) then + x = max(x, p%lower_bound) + end if + return + end if + + if (quantile >= 1._r8) then + x = maxval(edges(:)) + if (p%bounded_above) then + x = min(x, p%upper_bound) + end if + return + end if + + do i=2,2*p%ens_size+2 + if (quantile <= cdfs(i)) then + dq = cdfs(i) - cdfs(i-1) + x = (cdfs(i ) - quantile) * edges(i-1) / dq & + + (quantile - cdfs(i-1)) * edges(i ) / dq + return + end if + end do + +end function inv_kde_first_guess_params + +!----------------------------------------------------------------------- + +subroutine separate_ensemble(ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, ens_interior, ens_size_interior, & + q_lower, q_int, q_upper) + + ! Extracts the ensemble members that are not on the boundaries, and + ! also returns the fraction of ensemble members on each boundary and in + ! the interior. + + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(out) :: ens_interior(ens_size) + integer, intent(out) :: ens_size_interior + real(r8), intent(out) :: q_lower, q_int, q_upper + + ! local variables + integer :: i, j + logical :: is_interior(ens_size) + + q_lower = 0._r8 + q_int = 0._r8 + q_upper = 0._r8 + is_interior(:) = .true. + j = 0 + do i=1,ens_size + if (bounded_below) then + if (ens(i) <= lower_bound) then + q_lower = q_lower + 1._r8 + is_interior(i) = .false. + end if + end if + if (bounded_above) then + if (ens(i) >= upper_bound) then + q_upper = q_upper + 1._r8 + is_interior(i) = .false. + end if + end if + if (is_interior(i)) then + j = j + 1 + ens_interior(j) = ens(i) + end if + end do + ens_size_interior = ens_size - nint(q_lower + q_upper) + if (j .ne. ens_size_interior) then + errstring = 'Total number of interior ensemble members incorrect ' + call error_handler(E_ERR, 'separate_ensemble', errstring, source) + end if + q_lower = q_lower / real(ens_size, r8) + q_upper = q_upper / real(ens_size, r8) + q_int = 1._r8 - (q_lower + q_upper) + +end subroutine separate_ensemble + +!----------------------------------------------------------------------- + +subroutine obs_increment_kde(ens, ens_size, y, obs_param, bounded_below, & + bounded_above, lower_bound, upper_bound, obs_inc) + integer, intent(in) :: ens_size + real(r8), intent(in) :: ens(ens_size) + real(r8), intent(in) :: y + real(r8), intent(in) :: obs_param + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + real(r8), intent(out) :: obs_inc(ens_size) + + ! Applies a QCEF based on kernel density estimation & quadrature + ! Assumes a truncated normal likelihood whose bounds are the + ! same as the state variable. If the state variable is unbounded + ! then it uses a normal likelihood. Obviously this assumtion is + ! limiting. It could eventually be replaced with logic based on + ! obs_kind, which would be added as an input argument. + + real(r8) :: u + real(r8) :: p_lower_prior, p_int_prior, p_upper_prior + real(r8) :: p_lower_post, p_int_post, p_upper_post + real(r8) :: like_ens_mean ! ensemble mean of the likelihood + real(r8) :: ens_interior(ens_size) + real(r8) :: unif + real(r8) :: d(ens_size), d_max + type(distribution_params_type) :: params_interior_prior + type(distribution_params_type) :: params_interior_posterior + integer :: ens_size_interior + integer :: i, count_lower, count_int, count_upper + integer :: obs_dist_type + + if (bounded_below .or. bounded_above) then + obs_dist_type = obs_dist_types%truncated_normal + else + obs_dist_type = obs_dist_types%normal + endif + + ! If this is first time through, need to initialize the random sequence. + ! This will reproduce exactly for multiple runs with the same task count, + ! but WILL NOT reproduce for a different number of MPI tasks. + if (first_inc_ran_call) then + call random_seed() + call random_number(unif) + i = int(unif * 1000) + call init_random_seq(inc_ran_seq, my_task_id() + i) + first_inc_ran_call = .false. + endif + + ! If all ensemble members are identical, then there is no update + d(:) = abs( ens(:) - ens(1) ) + d_max = maxval(d) + if(d_max .le. 0.0_r8) then + obs_inc(:) = 0._r8 + return + endif + + ! Get mixture component probabilities for the prior, and the interior ensemble, and its params + call separate_ensemble(ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, ens_interior, ens_size_interior, & + p_lower_prior, p_int_prior, p_upper_prior) + if (ens_size_interior .gt. 1) then + d(1:ens_size_interior) = abs( ens_interior(1:ens_size_interior) - ens_interior(1) ) + d_max = maxval(d(1:ens_size_interior)) + if (d_max .gt. 0._r8) then + call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & + ens_interior, y, obs_param, obs_dist_types%uninformative, & + params_interior_prior) + call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, upper_bound, & + ens_interior, y, obs_param, obs_dist_type, & + params_interior_posterior) + endif + else + d_max = 0._r8 + endif + + ! Get mixture component probabilities for the posterior + if (bounded_below) then + p_lower_post = p_lower_prior * likelihood_function(lower_bound, y, obs_param, obs_dist_type) + else + p_lower_post = 0._r8 + end if + if (bounded_above) then + p_upper_post = p_upper_prior * likelihood_function(upper_bound, y, obs_param, obs_dist_type) + else + p_upper_post = 0._r8 + end if + p_int_post = 0._r8 + do i=1,ens_size_interior + p_int_post = p_int_post + likelihood_function(ens_interior(i), y, obs_param, obs_dist_type) + end do + p_int_post = p_int_prior * p_int_post / real(ens_size_interior, r8) + ! Get prior ensemble mean of likelihood + like_ens_mean = p_lower_post + p_int_post + p_upper_post + ! If likelihood underflow, assume flat likelihood, so no increments + if(like_ens_mean .le. 0.0_r8) then + obs_inc(:) = 0.0_r8 + return + else ! Finish getting mixture component probabilities for the posterior + p_lower_post = p_lower_post / like_ens_mean + p_upper_post = p_upper_post / like_ens_mean + p_int_post = 1._r8 - p_lower_post - p_upper_post + end if + + ! Update ensemble members -> get observation increments + + count_lower = 0 + count_int = 0 + count_upper = 0 + unif = random_uniform(inc_ran_seq) / real(ens_size, r8) + do i=1,ens_size + ! Map each ensemble member to a probability u in [0,1] using the prior cdf. Members + ! on the boundary get random probabilities. + if (bounded_below .and. (ens(i) .le. lower_bound)) then + ! Rather than draw a random for each member on the boundary, + ! I draw one random (above) then add 1/Nb to each subsequent value + u = unif + real(count_lower, r8) / real(ens_size, r8) + count_lower = count_lower + 1 + elseif (bounded_above .and. (ens(i) .ge. upper_bound)) then + ! As above, I draw one random then add 1/Nb to each subsequent value + u = 1._r8 - (unif + real(count_upper, r8) / real(ens_size, r8)) + count_upper = count_upper + 1 + elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then + ! Can't use kde with only one ensemble member (or identical ensemble members), so assign a + ! random probability. As above, I draw one random then add 1/Nb to each subsequent value + u = unif + p_lower_prior + real(count_int, r8) / real(ens_size, r8) + count_int = count_int + 1 + else ! Use the interior cdf obtained using kde. + u = kde_cdf_params(ens(i), params_interior_prior) + u = (p_int_prior * u) + p_lower_prior + end if + ! Invert the posterior kde cdf to get the updated ensemble member, then + ! subtract to get the increment. + if (u .le. p_lower_post) then ! Posterior value on the lower boundary + obs_inc(i) = lower_bound - ens(i) + elseif (u .ge. (1._r8 - p_upper_post)) then ! Posterior value on the upper boundary + obs_inc(i) = upper_bound - ens(i) + elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then + ! posterior value in the interior, but there's only one prior ensemble member/value + ! in the interior (or prior ensemble members in the interior are identical), so we + ! just assign the posterior ensemble member equal to the prior one + obs_inc(i) = ens_interior(1) - ens(i) + else ! posterior value in the interior, can use kde + ! Rescale u to be between 0 and 1 before inverting interior cdf + u = (u - p_lower_post) / p_int_post + obs_inc(i) = inv_kde_cdf_params(u, params_interior_posterior) + obs_inc(i) = obs_inc(i) - ens(i) + end if + end do + +end subroutine obs_increment_kde + +!----------------------------------------------------------------------- + +subroutine test_kde + ! This routine provides limited tests of the numerics in this module. It tests + ! the boundary correction function, the likelihood, the bandwidth selection, + ! and the cdf inversion. It uses an ensemble [-1, 1]. It tests the bandwidth + ! selection and the cdf inversion with zero, one, or two bounds at [-2, 2]. + ! Failing these tests suggests a serious problem. Passing them does not indicate + ! that there are acceptable results for all possible inputs. + + integer, parameter :: ens_size = 2 + real(r8), dimension(ens_size) :: ensemble, bandwidths, target_bandwidths + type(distribution_params_type) :: p + real(r8) :: x, y, inv, max_diff, lx, mx, like, obs_param + integer :: i, obs_dist_type + + ! Test the boundary correction code against a Mathematica implementation + x = 0.5_r8 + call boundary_correction(x, lx, mx) + write(*, *) '----------------------------' + write(*, *) 'test boundary correction' + write(*, *) 'abs difference in lx is ', abs(1.322997416020672_r8 - lx) + write(*, *) 'abs difference in mx is ', abs(1.102497846683893_r8 - mx) + write(*, *) 'abs differences should be less than 1e-15' + if ((abs(1.322997416020672_r8 - lx) > 1E-15) .or. (abs(1.102497846683893_r8 - mx) > 1E-15)) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + ! Test uninformative likelihood + max_diff = -1.0_r8 + y = 1._r8 + obs_param = 1._r8 + obs_dist_type = obs_dist_types%uninformative + do i = 0, 1000 + x = (real(i, r8) - 500.0_r8) / 500.0_r8 + like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = max(abs(1._r8 - like), max_diff) + end do + write(*, *) '----------------------------' + write(*, *) 'Uninformative likelihood test' + write(*, *) 'max difference in likelihood is ', max_diff + write(*, *) 'max difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + ! Test normal likelihood + x = 0.5_r8 + y = 0._r8 + obs_param = 1._r8 + obs_dist_type = obs_dist_types%normal + like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = abs(0.8824969025845955_r8 - like) + write(*, *) '----------------------------' + write(*, *) 'Normal likelihood test' + write(*, *) 'abs difference in likelihood is ', max_diff + write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + ! Test binomial obs distribution (beta likelihood) + x = 0.25_r8 + y = 3._r8 + obs_param = 5._r8 + obs_dist_type = obs_dist_types%binomial + like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = abs(0.0087890625_r8 - like) + write(*, *) '----------------------------' + write(*, *) 'Binomial obs distribution test' + write(*, *) 'abs difference in likelihood is ', max_diff + write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + ! Test gamma obs distribution (inverse gamma likelihood) + y = 1._r8 + x = 2._r8 + obs_param = 3._r8 + obs_dist_type = obs_dist_types%gamma + like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = abs(0.4658368455179406_r8 - like) + write(*, *) '----------------------------' + write(*, *) 'Gamma obs distribution test' + write(*, *) 'abs difference in likelihood is ', max_diff + write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + ! Test inverse gamma obs distribution (gamma likelihood) + y = 3._r8 + x = 2._r8 + obs_param = 4._r8 + obs_dist_type = obs_dist_types%inv_gamma + like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = abs(3.415489474106968_r8 - like) + write(*, *) '----------------------------' + write(*, *) 'Inverse gamma obs distribution test' + write(*, *) 'abs difference in likelihood is ', max_diff + write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + ! Test lognormal obs distribution (normal likelihood) + y = 1._r8 + x = 2._r8 + obs_param = 4._r8 + obs_dist_type = obs_dist_types%lognormal + like = likelihood_function(x, y, obs_param, obs_dist_type) + max_diff = abs(0.6065306597126334_r8 - like) + write(*, *) '----------------------------' + write(*, *) 'Lognormal obs distribution test' + write(*, *) 'abs difference in likelihood is ', max_diff + write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + ! Test truncated-normal obs distribution + y = 1._r8 + x = 2._r8 + obs_param = 4._r8 + obs_dist_type = obs_dist_types%truncated_normal + like = likelihood_function(x, y, obs_param, obs_dist_type, & + bounded_above=.true.,bounded_below=.true.,upper_bound=4._r8,lower_bound=0._r8) + max_diff = abs(1.292676850528392_r8 - like) + write(*, *) '----------------------------' + write(*, *) 'Truncated-normal obs distribution test, doubly-bounded' + write(*, *) 'abs difference in likelihood is ', max_diff + write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + like = likelihood_function(x, y, obs_param, obs_dist_type, & + bounded_above=.false.,bounded_below=.true.,upper_bound=4._r8,lower_bound=0._r8) + max_diff = abs(1.048912359301403_r8 - like) + write(*, *) '----------------------------' + write(*, *) 'Truncated-normal obs distribution test, bounded below' + write(*, *) 'abs difference in likelihood is ', max_diff + write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + like = likelihood_function(x, y, obs_param, obs_dist_type, & + bounded_above=.true.,bounded_below=.false.,upper_bound=4._r8,lower_bound=0._r8) + max_diff = abs(1.048912359301403_r8 - like) + write(*, *) '----------------------------' + write(*, *) 'Truncated-normal obs distribution test, bounded above' + write(*, *) 'abs difference in likelihood is ', max_diff + write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + ! Test bandwidth selection + target_bandwidths(:) = 2.462288826689833_r8 * [1._r8, 1._r8] + + ensemble(:) = [-1._r8, 1._r8] + + call get_kde_bandwidths(ens_size, ensemble, bandwidths) + + ! Compare computed bandwidths to exact + write(*, *) '----------------------------' + write(*, *) 'kde bandwidths test: Absolute value of differences should be less than 1e-15' + max_diff = 0._r8 + do i = 1, ens_size + max_diff = max(abs(bandwidths(i) - target_bandwidths(i)), max_diff) + write(*, *) i, abs(bandwidths(i) - target_bandwidths(i)) + end do + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + ! Test the inversion of the cdf over the support of the pdf, unbounded + write(*, *) '----------------------------' + write(*, *) 'Unbounded cdf/icdf test' + call pack_kde_params(ens_size, .false., .false., 0._r8, 0._r8, & + ensemble, 0._r8, 1._r8, obs_dist_types%uninformative, p) + + max_diff = -1.0_r8 + do i = 0, 1000 + x = (real(i - 500, r8) / 500.0_r8) * (1._r8 + target_bandwidths(1)) + y = kde_cdf_params(x, p) + inv = inv_kde_cdf_params(y, p) + ! write(*,*) 'Quantile: ', y, 'Exact x: ', x, 'Inverted x: ', inv, 'Error: ', abs(inv - x) + max_diff = max(abs(x-inv), max_diff) + end do + + write(*, *) 'max difference in inversion is ', max_diff + write(*, *) 'Errors should be small, max around 1E-8' + ! The accuracy degrades near the boundary. The slope of the cdf goes to zero on the boundary + ! But at least the second derivative is nonzero. So near the boundary it behaves like a + ! double root, and the accuracy of the rootfinding degrades. + if (max_diff > 1E-8_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + call deallocate_distribution_params(p) + + ! Test the inversion of the cdf over the entire support of the pdf, bounded below + write(*, *) '----------------------------' + write(*, *) 'cdf/icdf test bounded below' + call pack_kde_params(ens_size, .true., .false., -2._r8, 0._r8, & + ensemble, 0._r8, 1._r8, obs_dist_types%uninformative, p) + + max_diff = -1.0_r8 + do i = 0, 1000 + x = (real(i - 500, r8) / 500.0_r8) * (1._r8 + target_bandwidths(1)) + if (x <= -2._r8) then + cycle + else + y = kde_cdf_params(x, p) + inv = inv_kde_cdf_params(y, p) + ! write(*,*) 'Quantile: ', y, 'Exact x: ', x, 'Inverted x: ', inv, 'Error: ', abs(inv - x) + max_diff = max(abs(x-inv), max_diff) + end if + end do + + write(*, *) 'max difference in inversion is ', max_diff + write(*, *) 'Errors should be small, max below 1E-8' + if (max_diff > 1E-8_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + call deallocate_distribution_params(p) + + + ! Test the inversion of the cdf over the entire support of the pdf, bounded above + write(*, *) '----------------------------' + write(*, *) 'cdf/icdf test bounded above' + call pack_kde_params(ens_size, .false., .true., 0._r8, 2._r8, & + ensemble, 0._r8, 1._r8, obs_dist_types%uninformative, p) + + max_diff = -1.0_r8 + do i = 0, 1000 + x = ((real(i, r8) - 500.0_r8) / 500.0_r8) * (1._r8 + target_bandwidths(1)) + if (x >= 2._r8) then + cycle + else + y = kde_cdf_params(x, p) + inv = inv_kde_cdf_params(y, p) + ! write(*,*) 'Quantile: ', y, 'Exact x: ', x, 'Inverted x: ', inv, 'Error: ', abs(inv - x) + max_diff = max(abs(x-inv), max_diff) + end if + end do + + write(*, *) 'max difference in inversion is ', max_diff + write(*, *) 'Errors should be small, max below 1E-8' + if (max_diff > 1E-8_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + call deallocate_distribution_params(p) + + ! Test the inversion of the cdf over the entire support of the pdf, doubly bounded + write(*, *) '----------------------------' + write(*, *) 'cdf/icdf test doubly-bounded' + call pack_kde_params(ens_size, .true., .true., -2._r8, 2._r8, & + ensemble, 0._r8, 1._r8, obs_dist_types%uninformative, p) + + max_diff = -1.0_r8 + do i = 0, 1000 + x = ((real(i, r8) - 500.0_r8) / 500.0_r8) * (1._r8 + target_bandwidths(1)) + if ((x <= -2._r8) .or. (x >= 2._r8)) then + cycle + else + y = kde_cdf_params(x, p) + inv = inv_kde_cdf_params(y, p) + ! write(*,*) 'Quantile: ', y, 'Exact x: ', x, 'Inverted x: ', inv, 'Error: ', abs(inv - x) + max_diff = max(abs(x-inv), max_diff) + end if + end do + + write(*, *) 'max difference in inversion is ', max_diff + write(*, *) 'Errors should be small, max below 1E-8' + if (max_diff > 1E-8_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + call deallocate_distribution_params(p) + + ! Test the quadrature: Construct a case with bounds, but where the bounds are + ! far enough from the data that they are not used. In this case the kernel + ! density estimate should integrate exactly to one. + call pack_kde_params(ens_size, .true., .true., -2._r8, 2._r8, & + ensemble, 0._r8, 1._r8, obs_dist_types%uninformative, p) + p%lower_bound = -20._r8 + p%upper_bound = 20._r8 + p%more_params(ens_size + 1) = 1._r8 + + y = integrate_pdf(0._r8, p) + max_diff = abs(0.5_r8 - y) + write(*, *) '----------------------------' + write(*, *) 'test quadrature' + write(*, *) 'abs difference is ', max_diff + write(*, *) 'abs difference should be less than 1e-15' + if (max_diff > 1E-15_r8) then + write(*, *) 'FAIL' + else + write(*, *) 'PASS' + endif + + call deallocate_distribution_params(p) + +end subroutine test_kde + +!------------------------------------------------------------------------ + +end module kde_distribution_mod diff --git a/assimilation_code/modules/assimilation/probit_transform_mod.f90 b/assimilation_code/modules/assimilation/probit_transform_mod.f90 index f0ea534a2..c5645c15a 100644 --- a/assimilation_code/modules/assimilation/probit_transform_mod.f90 +++ b/assimilation_code/modules/assimilation/probit_transform_mod.f90 @@ -18,7 +18,7 @@ module probit_transform_mod NORMAL_DISTRIBUTION, BOUNDED_NORMAL_RH_DISTRIBUTION, & GAMMA_DISTRIBUTION, BETA_DISTRIBUTION, & LOG_NORMAL_DISTRIBUTION, UNIFORM_DISTRIBUTION, & - PARTICLE_FILTER_DISTRIBUTION + PARTICLE_FILTER_DISTRIBUTION, KDE_DISTRIBUTION use normal_distribution_mod, only : normal_cdf, inv_std_normal_cdf @@ -31,6 +31,9 @@ module probit_transform_mod use bnrh_distribution_mod, only : bnrh_cdf_initialized_vector, bnrh_cdf_params, & inv_bnrh_cdf_params, get_bnrh_sd +use kde_distribution_mod, only : kde_cdf_params, inv_kde_cdf_params, pack_kde_params, & + obs_dist_types, separate_ensemble + implicit none private @@ -183,7 +186,9 @@ subroutine transform_to_probit(ens_size, state_ens_in, distribution_type, p, & endif !---------------------------------------------------------------------------------- - +elseif(p%distribution_type == KDE_DISTRIBUTION) then + call to_probit_kde(ens_size, state_ens, p, probit_ens, & + use_input_p, bounded_below, bounded_above, lower_bound, upper_bound) !!!elseif(p%distribution_type == PARTICLE_FILTER_DISTRIBUTION) then !!!call to_probit_particle(ens_size, state_ens, p, probit_ens, use_input_p, & !!!bounded_below, bounded_above, lower_bound, upper_bound) @@ -446,6 +451,100 @@ end subroutine to_probit_bounded_normal_rh !!! !------------------------------------------------------------------------ +subroutine to_probit_kde(ens_size, state_ens, p, probit_ens, use_input_p, & + bounded_below, bounded_above, lower_bound, upper_bound) + + ! Transforms the values in state_ens. The transform is either defined + ! by state_ens (when use_input_p is false), or by p%ens (when use_input_p + ! is true). Handles the case where ensemble members are on the boundary. + + integer, intent(in) :: ens_size + real(r8), intent(in) :: state_ens(ens_size) + type(distribution_params_type), intent(inout) :: p + real(r8), intent(out) :: probit_ens(ens_size) + logical, intent(in) :: use_input_p + logical, intent(in) :: bounded_below, bounded_above + real(r8), intent(in) :: lower_bound, upper_bound + + ! local variables + real(r8) :: u + integer :: i + real(r8) :: y = 0._r8 ! Dummy value, not used + real(r8) :: obs_param = 1._r8 ! Dummy value, not used + real(r8) :: ens(ens_size) ! Ensemble that defines the transform + real(r8) :: ens_interior(ens_size) ! Ensemble members that are not on the boundaries + integer :: ens_size_interior ! Number of ensemble members that are not on the boundaries + type(distribution_params_type) :: p_interior + real(r8) :: d(ens_size), d_max + real(r8) :: p_lower, p_int, p_upper + + ! Get the ensemble that defines the transform + if(use_input_p) then + ens(:) = p%ens(:) + else + ! The input ensemble (state_ens) defines the transform. + ens(:) = state_ens(:) + endif + + ! If all ensemble members that define the transform are identical, then we can't + ! really define the transform, so we simply set all probit values to 0, pack the + ! ensemble members into p, and return. + d(:) = abs( ens(:) - ens(1) ) + d_max = maxval(d) + if(d_max .le. 0.0_r8) then + probit_ens(:) = 0._r8 + if(.not. use_input_p) then + allocate(p%ens(1:ens_size)) + p%ens(:) = ens(:) + endif + return + endif + + ! If we reach this point then the ensemble members that define the transform are + ! not all identical, and if we are not using the input p then we need to define it. + if (.not. use_input_p) then + call pack_kde_params(ens_size, bounded_below, bounded_above, lower_bound, & + upper_bound, ens, y, obs_param, obs_dist_types%uninformative, p) + endif + + ! Get mixture component probabilities and interior ensemble + call separate_ensemble(ens, ens_size, bounded_below, bounded_above, & + lower_bound, upper_bound, ens_interior, ens_size_interior, & + p_lower, p_int, p_upper) + + ! Get parameters of the kde distribution for the interior + if (ens_size_interior .gt. 1) then + d(1:ens_size_interior) = abs( ens_interior(1:ens_size_interior) - ens_interior(1) ) + d_max = maxval(d(1:ens_size_interior)) + if (d_max .gt. 0._r8) then + call pack_kde_params(ens_size_interior, bounded_below, bounded_above, lower_bound, & + upper_bound, ens_interior(1:ens_size_interior), y, obs_param, & + obs_dist_types%uninformative, p_interior) + endif + else + d_max = 0._r8 + endif + do i=1,ens_size + ! Map each state_ens member to a probability u using the prior cdf. Members + ! on the boundary map to p_lower and p_upper. + if (bounded_below .and. (state_ens(i) .le. lower_bound)) then + u = p_lower + elseif (bounded_above .and. (state_ens(i) .ge. upper_bound)) then + u = 1._r8 - p_upper + elseif ((ens_size_interior .le. 1) .or. (d_max .le. 0._r8)) then + ! Can't use kde with only one ensemble member, so assign to middle of interior range + u = (p_int * 0.5_r8) + p_lower + else ! Use the interior cdf obtained using kde. + u = kde_cdf_params(state_ens(i), p_interior) + u = (p_int * u) + p_lower + endif + ! Transform to probit/logit space + probit_ens(i) = probit_or_logit_transform(u) + end do + +end subroutine to_probit_kde + + subroutine transform_all_from_probit(ens_size, num_vars, probit_ens, p, state_ens) integer, intent(in) :: ens_size @@ -493,6 +592,8 @@ subroutine transform_from_probit(ens_size, probit_ens, p, state_ens) call from_probit_bounded_normal_rh(ens_size, probit_ens, p, state_ens) !!!elseif(p%distribution_type == PARTICLE_FILTER_DISTRIBUTION) then !!!call from_probit_particle(ens_size, probit_ens, p, state_ens) +elseif(p%distribution_type == KDE_DISTRIBUTION) then + call from_probit_kde(ens_size, probit_ens, p, state_ens) else write(errstring, *) 'Illegal distribution type', p%distribution_type call error_handler(E_ERR, 'transform_from_probit', errstring, source) @@ -643,6 +744,81 @@ end subroutine from_probit_bounded_normal_rh !------------------------------------------------------------------------ +subroutine from_probit_kde(ens_size, probit_ens, p, state_ens) + + ! Handles the case where ensemble members are on the boundary. + + integer, intent(in) :: ens_size + real(r8), intent(in) :: probit_ens(ens_size) + type(distribution_params_type), intent(inout) :: p + real(r8), intent(out) :: state_ens(ens_size) + + ! local variables + integer :: i + real(r8) :: u + real(r8) :: ens(ens_size) ! Ensemble that defines the transform + real(r8) :: ens_interior(ens_size) ! Ensemble members that are not on the boundaries + integer :: ens_size_interior ! Number of ensemble members that are not on the boundaries + type(distribution_params_type) :: p_interior + real(r8) :: p_lower, p_int, p_upper + real(r8) :: y, obs_param + real(r8) :: d(ens_size), d_max + + ! The ensemble that defines the transform is contained in p, so unpack it + ens(:) = p%ens(:) + + ! If all ensemble members are identical, then there is no update + d(:) = abs( ens(:) - ens(1) ) + d_max = maxval(d) + if(d_max .le. 0.0_r8) then + state_ens(:) = ens(1) + return + endif + ! Get mixture components on the boundaries and interior ensemble + call separate_ensemble(ens, ens_size, p%bounded_below, p%bounded_above, & + p%lower_bound, p%upper_bound, ens_interior, ens_size_interior, & + p_lower, p_int, p_upper) + + ! Get parameters of the kde distribution for the interior + if (ens_size_interior .gt. 1) then + d(1:ens_size_interior) = abs( ens_interior(1:ens_size_interior) - ens_interior(1) ) + d_max = maxval(d(1:ens_size_interior)) + if (d_max .gt. 0._r8) then + ! Unpack obs info from param struct + y = p%more_params(p%ens_size + 2) + obs_param = p%more_params(p%ens_size + 3) + call pack_kde_params(ens_size_interior, p%bounded_below, p%bounded_above, p%lower_bound, & + p%upper_bound, ens_interior(1:ens_size_interior), y, obs_param, & + obs_dist_types%uninformative, p_interior) + endif + else + d_max = 0._r8 + endif + + ! Transform each probit ensemble member back to physical space + do i = 1, ens_size + ! First, invert the probit/logit to get probabilities u + u = inv_probit_or_logit_transform(probit_ens(i)) + ! Next invert the mixture CDF to get the physical space ensemble + if (p%bounded_below .and. (u .le. p_lower)) then + state_ens(i) = p%lower_bound + elseif (p%bounded_above .and. (u .ge. 1._r8-p_upper)) then + state_ens(i) = p%upper_bound + elseif ((ens_size_interior .eq. 1) .or. (d_max .le. 0._r8)) then + ! If there is only one interior ensemble member in the prior then any + ! ensemble member that moves into the interior just gets mapped to the + ! one interior value from the prior. + state_ens(i) = ens_interior(1) + else + u = (u - p_lower) / p_int + state_ens(i) = inv_kde_cdf_params(u, p_interior) + endif + end do + +end subroutine from_probit_kde + +!------------------------------------------------------------------------ + function probit_or_logit_transform(quantile) real(r8) :: probit_or_logit_transform diff --git a/assimilation_code/modules/assimilation/rootfinding_mod.f90 b/assimilation_code/modules/assimilation/rootfinding_mod.f90 new file mode 100644 index 000000000..b0db8a2c1 --- /dev/null +++ b/assimilation_code/modules/assimilation/rootfinding_mod.f90 @@ -0,0 +1,274 @@ +! DART software - Copyright UCAR. This open source software is provided +! by UCAR, "as is", without charge, subject to all terms of use at +! http://www.image.ucar.edu/DAReS/DART/DART_download + +! This module originally written by I. Grooms and described (briefly) +! in "A Quantile-Conserving Ensemble Filter Based on Kernel-Density Estimation" +! by Grooms & Riedel (Remote Sensing, 2024; DOI:10.3390/rs16132377). + +module rootfinding_mod + +use types_mod, only : r8 + +use utilities_mod, only : E_ERR, E_MSG, E_ALLMSG, error_handler + +use distribution_params_mod, only : distribution_params_type, deallocate_distribution_params + +implicit none +private + +public :: inv_cdf + +character(len=512) :: errstring +character(len=*), parameter :: source = 'rootfinding_mod.f90' + +contains + +!------------------------------------------------------------------------ + +function inv_cdf(cdf_in, cdf, first_guess, p) result(quantile) + + interface + function cdf(x, p) + use types_mod, only : r8 + use distribution_params_mod, only : distribution_params_type + real(r8) :: cdf + real(r8), intent(in) :: x + type(distribution_params_type), intent(in) :: p + end function + end interface + + interface + function first_guess(u, p) + use types_mod, only : r8 + use distribution_params_mod, only : distribution_params_type + real(r8) :: first_guess + real(r8), intent(in) :: u + type(distribution_params_type), intent(in) :: p + end function + end interface + + real(r8) :: quantile + real(r8), intent(in) :: cdf_in + type(distribution_params_type), intent(in) :: p + + ! Given a probability cdf_in, finds the value of x (i.e. the quantile) for which cdf(x) + ! is approximately cdf_in = u. + ! Starts with Newton's method using an approximate derivative until either convergence + ! or overshoot. If it overshoots then it switches to the bisection-like ITP method + ! from Oliveira & Takahashi, ACM Trans Math Soft, 2020. Here f(x) = cdf(x) - cdf_in + + ! Local variables: + integer, parameter :: MAX_ITERATIONS = 50 ! Limit on the total number of iterations. + real(r8), parameter :: MIN_PROBABILITY = 0.0_r8, MAX_PROBABILITY = 0.999999999999999_r8 + real(r8), parameter :: TOL = epsilon(1._r8)**(0.75_r8) ! Absolute on q, relative on x + real(r8) :: u, u_err + real(r8) :: delta_x, delta_f + real(r8) :: x_guess, u_guess, x0, x1, f0, f1 + real(r8) :: a, b, fa, fb + integer :: iter + + real(r8) :: lower_bound, upper_bound + logical :: bounded_below, bounded_above + + ! Extract the required information from the p type + bounded_below = p%bounded_below; bounded_above = p%bounded_above + lower_bound = p%lower_bound; upper_bound = p%upper_bound + + u = cdf_in + + ! Do a test for illegal values on the probability u + if(u < 0.0_r8 .or. u > 1.0_r8) then + write(errstring, *) 'Illegal cdf input', u + call error_handler(E_ERR, 'inv_cdf', errstring, source) + endif + + ! If the distribution is bounded, probabilities of 0 and 1 have values at the bounds + if(bounded_below .and. u == 0.0_r8) then + quantile = lower_bound + return + endif + if(bounded_above .and. u == 1.0_r8) then + quantile = upper_bound + return + endif + + ! If input probabilities are outside the numerically supported range, move them to the extremes + u = min(u, MAX_PROBABILITY) + ! code tests stably for many distributions with min_probability of 0.0, could remove this + u = max(u, MIN_PROBABILITY) + + ! Get first guess from functional approximation + x_guess = first_guess(u, p) + + ! Evaluate the cdf + u_guess = cdf(x_guess, p) + + ! Check if first guess is close enough + u_err = u - u_guess + if (abs(u_err) .le. TOL) then + quantile = x_guess + return + end if + + ! If bounded below and target probability is between 0 and u_guess, go to ITP + if (bounded_below .and. (u .lt. u_guess)) then + a = lower_bound + b = x_guess + fa = 0._r8 - u + fb = u_guess - u + quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, MAX_ITERATIONS, p) + return + end if + + ! If bounded above and target probability is between u_guess and 1, go to ITP + if (bounded_above .and. (u_guess .lt. u)) then + a = x_guess + b = upper_bound + fa = u_guess + fb = 1._r8 + quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, MAX_ITERATIONS, p) + return + end if + + ! If we reach this line, then we can't yet bracket the true value of x, so we start + ! doing steps of the secant method, either until convergence or until we bracket + ! the root. If we bracket the root then we finish using the ITP method. + x0 = x_guess + f0 = u_guess - u + delta_x = max(1e-2_r8, 1e-2_r8 * abs(x_guess)) + if (f0 .gt. 0._r8) then + delta_x = -delta_x + end if + do iter=1,7 + x1 = x_guess + delta_x + if(bounded_below .and. (x1 .lt. lower_bound)) x1 = lower_bound + if(bounded_above .and. (x1 .gt. upper_bound)) x1 = upper_bound + f1 = cdf(x1, p) - u + delta_f = abs(f1 - f0) + if (delta_f .le. 0._r8) then + delta_x = 2._r8 * delta_x + else + exit + end if + end do + do iter = 1, MAX_ITERATIONS + ! If f0 * f1 < 0 then x0 and x1 bracket the root, so go to ITP + if (f0 * f1 .lt. 0._r8 ) then + a = min(x0, x1) + b = max(x0, x1) + fa = min(f0, f1) + fb = max(f0, f1) + quantile = inv_cdf_ITP(cdf, u, a, b, fa, fb, MAX_ITERATIONS - iter + 1, p) + return + else ! Do a secant step + delta_f = f1 - f0 + if (abs(delta_f) .eq. 0._r8) then ! Stop. Failed step: Flat CDF + write(errstring, *) 'Failed to converge; flat cdf' + write(*,*) p%bounded_below, p%bounded_above + write(*,*) 'More params:', p%more_params(:) + write(*,*) 'iter, x0, x1, f0, f1:', iter, x0, x1, f0, f1 + write(*,*) 'ens:', p%ens(:) + call error_handler(E_MSG, 'inv_cdf:', errstring, source) + quantile = x1 + return + else + x_guess = x1 - f1 * delta_x / delta_f + if (bounded_above .and. (x_guess .gt. upper_bound)) x_guess = upper_bound + if (bounded_below .and. (x_guess .lt. lower_bound)) x_guess = lower_bound + x0 = x1 + f0 = f1 + x1 = x_guess + delta_x = x1 - x0 + f1 = cdf(x1, p) - u + quantile = x1 + end if + end if + + ! Finished secant step. Check for convergence. + if (abs(delta_x) .le. TOL * max(abs(x0), abs(x1)) .or. (abs(f1) .le. TOL)) then + return + endif + end do + + ! For now, have switched a failed convergence to return the latest guess + ! This has implications for stability of probit algorithms that require further study + ! Not currently happening for any of the test cases on gfortran + write(errstring, *) 'Failed to converge for probability ', u + call error_handler(E_ALLMSG, 'inv_cdf', errstring, source) + !!!call error_handler(E_ERR, 'inv_cdf', errstring, source) + +end function inv_cdf + +!----------------------------------------------------------------------- + +function inv_cdf_ITP(cdf, u, a, b, fa, fb, max_iterations, p) result(x) + interface + function cdf(x, p) + use types_mod, only : r8 + use distribution_params_mod, only : distribution_params_type + real(r8) :: cdf + real(r8), intent(in) :: x + type(distribution_params_type), intent(in) :: p + end function + end interface + + real(r8), intent(in) :: u + real(r8), intent(inout) :: a, b, fa, fb + integer, intent(in) :: max_iterations + type(distribution_params_type), intent(in) :: p + real(r8) :: x + + ! Given a probability u, finds the value of x for which cdf(x) is approximately u + ! Uses the bisection-like ITP method from Oliveira & Takahashi, ACM Trans Math + ! Soft, 2020. Here f(x) = cdf(x) - u. Assumes f(a) < 0 and f(b) > 0 on input. + + ! Local variables: + integer, parameter :: N0 = 1._r8 + real(r8), parameter :: KAPPA2 = 2._r8 + real(r8), parameter :: TOL = epsilon(1._r8)**(0.75_r8) ! abs on f, rel on x + real(r8) :: kappa1 + real(r8) :: delta + real(r8) :: x_t, x_f, x_half, f_ITP + real(r8) :: eps, r + integer :: i, n_max, n_half + + kappa1 = 0.2_r8 / (b - a) + eps = TOL * max(abs(a), abs(b)) + ! Max number of iterations is either (i) input max, or (ii) number of bisection + ! iterations (plus N0) needed to achieve the desired relative tolerance on x. + n_half = max(0, ceiling(log(0.5_r8 * (b - a) / eps) / log(2._r8))) + n_max = min(max_iterations, N0 + n_half) + + do i=1,n_max + x_half = 0.5_r8 * (a + b) ! Bisection guess + x_f = (b* fa - a * fb) / (fa - fb) ! Secant/Regula-Falsi guess + delta = kappa1 * abs(b - a)**KAPPA2 + if (delta .le. abs(x_half - x_f)) then + x_t = x_f + sign(delta, x_half - x_f) + else + x_t = x_half + end if + r = max(0._r8, eps * 2**(n_half - i) - 0.5_r8 * (b - a)) + if (abs(x_t - x_half) .le. r) then + x = x_t + else + x = x_half - sign(r, x_half - x_f) + end if + f_ITP = cdf(x, p) - u + if (abs(f_ITP) .le. TOL) then + return + elseif (f_ITP .gt. 0._r8) then + b = x + fb = f_ITP + else + a = x + fa = f_ITP + end if + end do + +end function inv_cdf_ITP + +!----------------------------------------------------------------------- + +end module rootfinding_mod diff --git a/conf.py b/conf.py index fdfe8997c..c0c136b51 100644 --- a/conf.py +++ b/conf.py @@ -21,7 +21,7 @@ author = 'Data Assimilation Research Section' # The full version, including alpha/beta/rc tags -release = '11.6.1' +release = '11.7.0' root_doc = 'index' # -- General configuration --------------------------------------------------- diff --git a/developer_tests/kde_dist/test_kde_dist.f90 b/developer_tests/kde_dist/test_kde_dist.f90 new file mode 100644 index 000000000..8e4d1e9e6 --- /dev/null +++ b/developer_tests/kde_dist/test_kde_dist.f90 @@ -0,0 +1,12 @@ +! DART software - Copyright UCAR. This open source software is provided +! by UCAR, "as is", without charge, subject to all terms of use at +! http://www.image.ucar.edu/DAReS/DART/DART_download + +program test_kde_mod + use utilities_mod, only : initialize_utilities, finalize_utilities + use kde_distribution_mod, only : test_kde + call initialize_utilities() + call test_kde() + call finalize_utilities() + +end program test_kde_mod diff --git a/developer_tests/kde_dist/work/input.nml b/developer_tests/kde_dist/work/input.nml new file mode 100644 index 000000000..ab9ab5619 --- /dev/null +++ b/developer_tests/kde_dist/work/input.nml @@ -0,0 +1,12 @@ +&utilities_nml + module_details = .false. + / + +&preprocess_nml + input_obs_qty_mod_file = '../../../assimilation_code/modules/observations/DEFAULT_obs_kind_mod.F90' + output_obs_qty_mod_file = '../../../assimilation_code/modules/observations/obs_kind_mod.f90' + input_obs_def_mod_file = '../../../observations/forward_operators/DEFAULT_obs_def_mod.F90' + output_obs_def_mod_file = '../../../observations/forward_operators/obs_def_mod.f90' + obs_type_files = '../../../observations/forward_operators/obs_def_gps_mod.f90' + quantity_files = '../../../assimilation_code/modules/observations/default_quantities_mod.f90' + / diff --git a/developer_tests/kde_dist/work/quickbuild.sh b/developer_tests/kde_dist/work/quickbuild.sh new file mode 100755 index 000000000..94912c002 --- /dev/null +++ b/developer_tests/kde_dist/work/quickbuild.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +# DART software - Copyright UCAR. This open source software is provided +# by UCAR, "as is", without charge, subject to all terms of use at +# http://www.image.ucar.edu/DAReS/DART/DART_download + +main() { + + +export DART=$(git rev-parse --show-toplevel) +source "$DART"/build_templates/buildfunctions.sh + +MODEL="none" +EXTRA="$DART"/models/template/threed_model_mod.f90 +dev_test=1 +LOCATION="threed_sphere" +TEST="kde_dist" + +serial_programs=( +test_kde_dist +) + +# quickbuild arguments +arguments "$@" + +# clean the directory +\rm -f -- *.o *.mod Makefile .cppdefs + +# build and run preprocess before making any other DART executables +buildpreprocess + +# build DART +buildit + +# clean up +\rm -f -- *.o *.mod + +} + +main "$@" diff --git a/guide/qceff_probit.rst b/guide/qceff_probit.rst index dddd4fbf7..c46cf47bd 100644 --- a/guide/qceff_probit.rst +++ b/guide/qceff_probit.rst @@ -4,8 +4,8 @@ Quantile-Conserving Ensemble Filter Framework ============================================== The Quantile-Conserving Ensemble Filter Framework (QCEFF) tools are available in DART -as of version v11. -The DART development team (dart@ucar.edu) would be happy to hear about your experiences +as of version v11. +The DART development team (dart@ucar.edu) would be happy to hear about your experiences and is anxious to build scientific collaborations using these new capabilities. The QCEFF options are set using a :ref:`qceff table ` given as a namelist option to &algorithm_info_nml. @@ -21,7 +21,7 @@ The QCEFF options are set using a :ref:`qceff table ` given as a na QCEFF options -------------- -QCEFF options are per quantity. For a given quantity, you specify the following +QCEFF options are per quantity. For a given quantity, you specify the following options as columns of the qceff_table: * Observation error information @@ -29,19 +29,19 @@ options as columns of the qceff_table: Provides information about boundedness constraints that control the likelihood distribution associated with an observed variable when using perfect_model_obs to generate noisy observations. - - * bounded_below (default .false.) + + * bounded_below (default .false.) * bounded_above (default .false.) - * lower_bound + * lower_bound * upper_bound -* Probit distribution information +* Probit distribution information Used in the computation of the probit transform. The values needed are the bounds and the distribution type. These can be different for all three cases (inflation, state, and extended_state priors) - + * distribution (one of :ref:`Distributions`) * bounded_below (default .false.) * bounded_above (default .false.) @@ -65,17 +65,17 @@ Creating a qceff table ----------------------- The table has two headers, row 1 and 2. -The first row is the version number. The second row describes the :ref:`QCEFF options` corresponding to each column of the table. -These two headers must be present in your qceff table. -The :ref:`qcf trunc table` below shows the table headers, -and an example quantity QTY_TRACER_CONCENTRATION for the first 5 columns of the table. +The first row is the version number. The second row describes the :ref:`QCEFF options` corresponding to each column of the table. +These two headers must be present in your qceff table. +The :ref:`qcf trunc table` below shows the table headers, +and an example quantity QTY_TRACER_CONCENTRATION for the first 5 columns of the table. There is a complete table with all 25 columns in `Google Sheets `_. You can copy and edit this table as needed. To add a quantity, add a row to the table. -For any quantity not listed in the table, the :ref:`Default values` values will be used for all 25 options. +For any quantity not listed in the table, the :ref:`Default values` values will be used for all 25 options. You only have to add rows for quantities that use non-default values for any of the input options. Ensure that there are no empty rows in between the quantities listed in the spreadsheet. -Save your spreadsheet as a .csv file. +Save your spreadsheet as a .csv file. To run filter or perfect_model_obs, put the .csv file in the directory where you are running. Edit input.nml to set the algorithm_info_nml option qceff_table_filename, for example: @@ -89,14 +89,14 @@ Edit input.nml to set the algorithm_info_nml option qceff_table_filename, for ex .. _qcf trunc table: -.. list-table:: truncated table +.. list-table:: truncated table :header-rows: 2 * - QCF table version: 1 - - - - - - - - + - + - + - + - * - QTY - bounded_below - bounded_above @@ -119,6 +119,7 @@ Available filter kinds * UNBOUNDED_RHF * GAMMA_FILTER * BOUNDED_NORMAL_RHF + * KDE_FILTER .. _Distributions: @@ -127,11 +128,11 @@ Available distributions * NORMAL_DISTRIBUTION (default) * BOUNDED_NORMAL_RH_DISTRIBUTION - * GAMMA_DISTRIBUTION + * GAMMA_DISTRIBUTION * BETA_DISTRIBUTION * LOG_NORMAL_DISTRIBUTION * UNIFORM_DISTRIBUTION - + * KDE_DISTRIBUTION .. _Default values: