Merge pull request #78 from BerkeleyLab/dr-integrate-adam
Add Adam optimizer and stochastic gradient descent
rouson authored Sep 6, 2023
2 parents fc68d78 + c77206f commit 6049804
Showing 5 changed files with 114 additions and 31 deletions.
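
For orientation, the new weight and bias updates in src/inference_engine/trainable_engine_s.f90 follow the familiar Adam recurrences sketched below, where g is the mini-batch-averaged gradient (dcdw or dcdb), the hyperparameters correspond to the alpha, beta1, beta2, and epsilon parameters declared in the diff, and the diff uses num_mini_batches as the bias-correction exponent t:

  v_t = \beta_1 v_{t-1} + (1 - \beta_1)\, g_t, \qquad s_t = \beta_2 s_{t-1} + (1 - \beta_2)\, g_t^2

  \hat{v}_t = \frac{v_t}{1 - \beta_1^{\,t}}, \qquad \hat{s}_t = \frac{s_t}{1 - \beta_2^{\,t}}, \qquad \theta_t = \theta_{t-1} - \alpha\, \frac{\hat{v}_t}{\sqrt{\hat{s}_t} + \epsilon}
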
61 changes: 44 additions & 17 deletions app/train-cloud-microphysics.f90
@@ -31,7 +31,7 @@ program train_cloud_microphysics
!! https://github.com/BerkeleyLab/icar.

!! External dependencies:
use sourcery_m, only : string_t, file_t, command_line_t
use sourcery_m, only : string_t, file_t, command_line_t, bin_t
use assert_m, only : assert, intrinsic_array_t
use ieee_arithmetic, only : ieee_is_nan
use iso_fortran_env, only : int64, real64
@@ -71,7 +71,7 @@ program train_cloud_microphysics
double precision, allocatable, dimension(:) :: time_in, time_out
double precision, parameter :: tolerance = 1.E-07
integer, allocatable :: lbounds(:)
integer t_end, t
integer t_end, t, b

print *,"Reading network inputs from " // network_input

@@ -148,14 +148,15 @@ program train_cloud_microphysics
type(trainable_engine_t) trainable_engine
type(inference_engine_t) inference_engine
type(mini_batch_t), allocatable :: mini_batches(:)
type(tensor_t), allocatable, dimension(:,:) :: inputs, outputs
type(tensor_t), allocatable, dimension(:) :: tmp1, tmp2
type(bin_t), allocatable :: bins(:)
type(input_output_pair_t), allocatable :: input_output_pairs(:)
type(tensor_t), allocatable, dimension(:) :: inputs, outputs
integer, parameter :: mini_batch_size=1
integer batch, lon, lat, level, time
type(file_t) json_file
real(rkind), allocatable :: cost(:)

trainable_engine = hidden_layers(num_hidden_layers=8, nodes_per_hidden_layer=16, num_inputs=8, num_outputs=6, random=.true.)
trainable_engine = hidden_layers(num_hidden_layers=12, nodes_per_hidden_layer=16, num_inputs=8, num_outputs=6, random=.true.)

associate(num_mini_batches => size(qc_in,1)*size(qc_in,2)*size(qc_in,3))

@@ -168,41 +169,48 @@ program train_cloud_microphysics

! The following temporary copies are required by gfortran bug 100650 and possibly 49324
! See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100650 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49324
tmp1 = [( [( [( &
inputs = [( [( [( &
tensor_t( &
[ pressure_in(lon,lat,level,time), potential_temperature_in(lon,lat,level,time), temperature_in(lon,lat,level,time),&
qv_in(lon,lat,level,time), qc_in(lon,lat,level,time), qi_in(lon,lat,level,time), qr_in(lon,lat,level,time), &
qs_in(lon,lat,level,time) &
] &
), lon = 1, size(qv_in,1))], lat = 1, size(qv_in,2))], level = 1, size(qv_in,3))]

tmp2 = [( [( [( &
outputs = [( [( [( &
tensor_t( &
[dpt_dt(lon,lat,level,time), dqv_dt(lon,lat,level,time), dqc_dt(lon,lat,level,time), &
dqi_dt(lon,lat,level,time), dqr_dt(lon,lat,level,time), dqs_dt(lon,lat,level,time) &
] &
), lon = 1, size(qv_in,1))], lat = 1, size(qv_in,2))], level = 1, size(qv_in,3))]

eliminate_zeros: &
eliminate_zero_derivatives: &
block
integer i
real, allocatable :: harvest(:)

if (time==1) print *,"Eliminating grid points where all time derivatives are zero"

associate(num_grid_pts => size(tmp2))
associate(non_zero_derivatives => [(any(tmp2(i)%values()/=0.), i=1,num_grid_pts)])
tmp2 = pack(tmp2, non_zero_derivatives)
tmp1 = pack(tmp1, non_zero_derivatives)
if (time==1) print *, size(tmp2), "points with non-zero derivatives out of ", num_grid_pts, " points total"
associate(num_grid_pts => size(outputs))
allocate(harvest(num_grid_pts))
call random_number(harvest)
associate(non_zero_derivatives => [(any(outputs(i)%values()/=0.) .or. harvest(i)<0.3, i=1,num_grid_pts)])
outputs = pack(outputs, non_zero_derivatives)
inputs = pack(inputs, non_zero_derivatives)
if (time==1) print *, size(outputs), "points retained (non-zero derivatives or random sample) out of ", num_grid_pts, " points total"
end associate
end associate

end block eliminate_zeros
end block eliminate_zero_derivatives

inputs = reshape(tmp1, [mini_batch_size, size(tmp1)])
outputs = reshape(tmp2, shape(inputs))
input_output_pairs = input_output_pair_t(inputs, outputs)

mini_batches = [(mini_batch_t(input_output_pair_t(inputs(:,batch), outputs(:,batch))), batch = 1, size(tmp1))]
call shuffle(input_output_pairs) ! set up for stochastic gradient descent

associate(num_pairs => size(input_output_pairs), n_bins => size(input_output_pairs)/10000)
bins = [(bin_t(num_items=num_pairs, num_bins=n_bins, bin_number=b), b = 1, n_bins)]
mini_batches = [(mini_batch_t(input_output_pairs(bins(b)%first():bins(b)%last())), b = 1, size(bins))]
end associate

if (time==1) print *,"Training network"
call trainable_engine%train(mini_batches, cost)
@@ -257,5 +265,24 @@ function hidden_layers(num_hidden_layers, nodes_per_hidden_layer, num_inputs, nu

end function

subroutine shuffle(pairs)
type(input_output_pair_t), intent(inout) :: pairs(:)
type(input_output_pair_t) temp
real harvest(2:size(pairs))
integer i, j

call random_init(image_distinct=.true., repeatable=.true.)
call random_number(harvest)

durstenfeld_shuffle: &
do i = size(pairs), 2, -1
j = 1 + int(harvest(i)*i)
temp = pairs(i)
pairs(i) = pairs(j)
pairs(j) = temp
end do durstenfeld_shuffle

end subroutine

end program train_cloud_microphysics
#endif // __INTEL_FORTRAN
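
The training-data preparation above now shuffles the input/output pairs and then slices the shuffled sequence into contiguous mini-batches via sourcery's bin_t. The following self-contained sketch mirrors that pattern under stated assumptions (it is not code from the commit): plain integers stand in for input_output_pair_t, and explicit bounds arithmetic stands in for bin_t's first()/last().

program shuffle_and_bin_demo
  !! Illustration only: Durstenfeld-shuffle a sequence, then carve it into
  !! contiguous mini-batches, mimicking the shuffle subroutine and bin_t usage above.
  implicit none
  integer, parameter :: num_pairs = 20, batch_size = 5
  integer :: pairs(num_pairs), temp, i, j, b, first, last
  real :: harvest(2:num_pairs)

  pairs = [(i, i = 1, num_pairs)]

  call random_init(repeatable=.true., image_distinct=.true.) ! reproducible shuffle, as in the new shuffle subroutine
  call random_number(harvest)

  do i = num_pairs, 2, -1      ! Durstenfeld (Fisher-Yates) shuffle
    j = 1 + int(harvest(i)*i)  ! uniform draw from 1..i
    temp = pairs(i); pairs(i) = pairs(j); pairs(j) = temp
  end do

  do b = 1, num_pairs/batch_size  ! contiguous slices play the role of bin_t's first()/last()
    first = (b-1)*batch_size + 1
    last  = b*batch_size
    print '(a,i0,a,*(1x,i0))', "mini-batch ", b, ":", pairs(first:last)
  end do
end program shuffle_and_bin_demo
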
2 changes: 1 addition & 1 deletion fpm.toml
@@ -6,5 +6,5 @@ maintainer = "rouson@lbl.gov"

[dependencies]
assert = {git = "https://github.com/sourceryinstitute/assert", tag = "1.5.0"}
sourcery = {git = "https://github.com/sourceryinstitute/sourcery", tag = "3.8.2"}
sourcery = {git = "https://github.com/sourceryinstitute/sourcery", tag = "3.9.0"}
netcdf-interfaces = {git = "https://github.com/rouson/netcdf-interfaces.git", branch = "implicit-interfaces"}
3 changes: 2 additions & 1 deletion src/inference_engine/trainable_engine_m.f90
@@ -54,11 +54,12 @@ pure module subroutine assert_consistent(self)
class(trainable_engine_t), intent(in) :: self
end subroutine

pure module subroutine train(self, mini_batches, cost)
pure module subroutine train(self, mini_batches, cost, adam)
implicit none
class(trainable_engine_t), intent(inout) :: self
type(mini_batch_t), intent(in) :: mini_batches(:)
real(rkind), intent(out), allocatable, optional :: cost(:)
logical, intent(in), optional :: adam
end subroutine

elemental module function infer(self, inputs) result(outputs)
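With the revised interface, a call site can opt into Adam through the new optional dummy argument; a minimal hypothetical example (mini_batches and cost declared as in the interface above):

call trainable_engine%train(mini_batches, cost, adam=.true.)
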
69 changes: 61 additions & 8 deletions src/inference_engine/trainable_engine_s.f90
@@ -66,18 +66,36 @@
module procedure train
integer l, batch, mini_batch_size, pair
real(rkind), parameter :: eta = 1.5e0 ! Learning parameter
real(rkind), allocatable :: z(:,:), a(:,:), delta(:,:), dcdw(:,:,:), dcdb(:,:)
real(rkind), allocatable :: &
z(:,:), a(:,:), delta(:,:), dcdw(:,:,:), dcdb(:,:), vdw(:,:,:), sdw(:,:,:), vdb(:,:), sdb(:,:), vdwc(:,:,:), sdwc(:,:,:), &
vdbc(:,:), sdbc(:,:)

type(tensor_t), allocatable :: inputs(:), expected_outputs(:)

call self%assert_consistent

associate(output_layer => ubound(self%n,1))

allocate(a(maxval(self%n), input_layer:output_layer)) ! Activations

allocate(dcdw, mold=self%w) ! Gradient of cost function with respect to weights
allocate(vdw, mold=self%w)
allocate(sdw, mold=self%w)
allocate(vdwc, mold=self%w)
allocate(sdwc, mold=self%w)

allocate(z, mold=self%b) ! z-values: Sum z_j^l = w_jk^{l} a_k^{l-1} + b_j^l
allocate(delta, mold=self%b)
allocate(dcdb, mold=self%b) ! Gradient of cost function with respect with biases
allocate(vdb, mold=self%b)
allocate(sdb, mold=self%b)
allocate(vdbc, mold=self%b)
allocate(sdbc, mold=self%b)

vdw = 0.d0
sdw = 1.d0
vdb = 0.d0
sdb = 1.d0

associate(w => self%w, b => self%b, n => self%n, num_mini_batches => size(mini_batches))

@@ -137,13 +155,48 @@

end do iterate_through_batch

adjust_weights_and_biases: &
do l = 1,output_layer
dcdb(1:n(l),l) = dcdb(1:n(l),l)/mini_batch_size
b(1:n(l),l) = b(1:n(l),l) - eta*dcdb(1:n(l),l) ! Adjust biases
dcdw(1:n(l),1:n(l-1),l) = dcdw(1:n(l),1:n(l-1),l)/mini_batch_size
w(1:n(l),1:n(l-1),l) = w(1:n(l),1:n(l-1),l) - eta*dcdw(1:n(l),1:n(l-1),l) ! Adjust weights
end do adjust_weights_and_biases
if (present(adam)) then
if (adam) then

block
! Adam parameters
real, parameter :: beta1 = .9
real, parameter :: beta2 = .999
real, parameter :: obeta1 = 1.d0 - beta1
real, parameter :: obeta2 = 1.d0 - beta2
real, parameter :: epsilon = 1.d-8
real, parameter :: alpha = 1.5d0 ! Learning parameter

adjust_weights_and_biases: &
do l = 1,output_layer
dcdw(1:n(l),1:n(l-1),l) = dcdw(1:n(l),1:n(l-1),l)/(mini_batch_size)
vdw(1:n(l),1:n(l-1),l) = beta1*vdw(1:n(l),1:n(l-1),l) + obeta1*dcdw(1:n(l),1:n(l-1),l)
sdw (1:n(l),1:n(l-1),l) = beta2*sdw(1:n(l),1:n(l-1),l) + obeta2*(dcdw(1:n(l),1:n(l-1),l)**2)
vdwc(1:n(l),1:n(l-1),l) = vdw(1:n(l),1:n(l-1),l)/(1.D0 - beta1**num_mini_batches)
sdwc(1:n(l),1:n(l-1),l) = sdw(1:n(l),1:n(l-1),l)/(1.D0 - beta2**num_mini_batches)
w(1:n(l),1:n(l-1),l) = w(1:n(l),1:n(l-1),l) &
- alpha*vdwc(1:n(l),1:n(l-1),l)/(sqrt(sdwc(1:n(l),1:n(l-1),l))+epsilon) ! Adjust weights

dcdb(1:n(l),l) = dcdb(1:n(l),l)/mini_batch_size
vdb(1:n(l),l) = beta1*vdb(1:n(l),l) + obeta1*dcdb(1:n(l),l)
sdb(1:n(l),l) = beta2*sdb(1:n(l),l) + obeta2*(dcdb(1:n(l),l)**2)
vdbc(1:n(l),l) = vdb(1:n(l),l)/(1.D0 - beta1**num_mini_batches)
sdbc(1:n(l),l) = sdb(1:n(l),l)/(1.D0 - beta2**num_mini_batches)
b(1:n(l),l) = b(1:n(l),l) - alpha*vdbc(1:n(l),l)/(sqrt(sdbc(1:n(l),l))+epsilon) ! Adjust biases
end do adjust_weights_and_biases
end block
else
error stop "trainable_engine_s(train): for non-adam runs, please rerun without adam argument present"
end if
else
adjust_weights_and_biases: &
do l = 1,output_layer
dcdb(1:n(l),l) = dcdb(1:n(l),l)/mini_batch_size
b(1:n(l),l) = b(1:n(l),l) - eta*dcdb(1:n(l),l) ! Adjust biases
dcdw(1:n(l),1:n(l-1),l) = dcdw(1:n(l),1:n(l-1),l)/mini_batch_size
w(1:n(l),1:n(l-1),l) = w(1:n(l),1:n(l-1),l) - eta*dcdw(1:n(l),1:n(l-1),l) ! Adjust weights
end do adjust_weights_and_biases
end if

end do iterate_across_batches

10 changes: 6 additions & 4 deletions test/trainable_engine_test_m.f90
@@ -131,6 +131,8 @@ function two_random_hidden_layers() result(trainable_engine)

call random_number(b)
call random_number(w)
b = 2*b
w = 2*w

trainable_engine = trainable_engine_t( &
nodes = neurons, weights = w, biases = b, differentiable_activation_strategy = sigmoid_t(), metadata = &
@@ -164,7 +166,7 @@ function and_gate_with_skewed_training_data() result(test_passes)
mini_batches = [(mini_batch_t(input_output_pair_t(training_inputs(:,iter), training_outputs(:,iter))), iter=1, num_iterations)]
trainable_engine = two_zeroed_hidden_layers()

call trainable_engine%train(mini_batches)
call trainable_engine%train(mini_batches,adam=.true.)

test_inputs = [tensor_t([true,true]), tensor_t([false,true]), tensor_t([true,false]), tensor_t([false,false])]
expected_test_outputs = [(and(test_inputs(i)), i=1, size(test_inputs))]
@@ -208,7 +210,7 @@ function not_and_gate_with_skewed_training_data() result(test_passes)
mini_batches = [(mini_batch_t(input_output_pair_t(training_inputs(:,iter), training_outputs(:,iter))), iter=1, num_iterations)]
trainable_engine = two_zeroed_hidden_layers()

call trainable_engine%train(mini_batches)
call trainable_engine%train(mini_batches, adam=.true.)

test_inputs = [tensor_t([true,true]), tensor_t([false,true]), tensor_t([true,false]), tensor_t([false,false])]
expected_test_outputs = [(not_and(test_inputs(i)), i=1, size(test_inputs))]
@@ -250,7 +252,7 @@ function or_gate_with_random_weights() result(test_passes)
mini_batches = [(mini_batch_t(input_output_pair_t(training_inputs(:,iter), training_outputs(:,iter))), iter=1, num_iterations)]
trainable_engine = two_random_hidden_layers()

call trainable_engine%train(mini_batches)
call trainable_engine%train(mini_batches, adam=.true.)

test_inputs = [tensor_t([true,true]), tensor_t([false,true]), tensor_t([true,false]), tensor_t([false,false])]
expected_test_outputs = [(or(test_inputs(i)), i=1, size(test_inputs))]
@@ -292,7 +294,7 @@ function xor_gate_with_random_weights() result(test_passes)
mini_batches = [(mini_batch_t(input_output_pair_t(training_inputs(:,iter), training_outputs(:,iter))), iter=1, num_iterations)]
trainable_engine = two_random_hidden_layers()

call trainable_engine%train(mini_batches)
call trainable_engine%train(mini_batches, adam=.true.)

test_inputs = [tensor_t([true,true]), tensor_t([false,true]), tensor_t([true,false]), tensor_t([false,false])]
expected_test_outputs = [(xor(test_inputs(i)), i=1, size(test_inputs))]