From d74c0760fff370563afa864f2e093c26f61c9676 Mon Sep 17 00:00:00 2001 From: bball Date: Mon, 12 Nov 2018 16:48:54 -0700 Subject: [PATCH 1/2] add retry to RestClient --- docker/R/lib/api_create_datapoint.rb | 13 ++++++++--- .../jobs/dj_jobs/run_simulate_data_point.rb | 22 ++++++++++++++++--- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/docker/R/lib/api_create_datapoint.rb b/docker/R/lib/api_create_datapoint.rb index eb2a09843..1e776f050 100644 --- a/docker/R/lib/api_create_datapoint.rb +++ b/docker/R/lib/api_create_datapoint.rb @@ -84,9 +84,12 @@ # check the response if datapoint_id puts 'Datapoint created, submitting to run queue' - - a = RestClient.put "#{options[:host]}/data_points/#{datapoint_id}/run.json", {} - a = JSON.parse(a, symbolize_names: true) + post_count = 0 + post_count_max = 5 + begin + post_count += 1 + a = RestClient.put "#{options[:host]}/data_points/#{datapoint_id}/run.json", {} + a = JSON.parse(a, symbolize_names: true) # check to make sure that it was submitted and grab the run id if a[:job_id] @@ -132,6 +135,10 @@ sleep options[:sleep_time] end end + rescue => e + retry if post_count <= post_count_max + raise "Posting of the run.json file failed #{post_count_max} times with error #{e.message}" + end end end diff --git a/server/app/jobs/dj_jobs/run_simulate_data_point.rb b/server/app/jobs/dj_jobs/run_simulate_data_point.rb index 661da6197..71c67d85a 100644 --- a/server/app/jobs/dj_jobs/run_simulate_data_point.rb +++ b/server/app/jobs/dj_jobs/run_simulate_data_point.rb @@ -121,12 +121,28 @@ def perform end # delete any existing data files from the server in case this is a 'rerun' - RestClient.delete "#{APP_CONFIG['os_server_host_url']}/data_points/#{@data_point.id}/result_files" - + @sim_logger.info "RestClient delete" + post_count = 0 + post_count_max = 5 + begin + post_count += 1 + RestClient.delete "#{APP_CONFIG['os_server_host_url']}/data_points/#{@data_point.id}/result_files" + rescue => e + retry if post_count <= post_count_max + raise "RestClient.delete failed with error #{e.message}" + end # Download the datapoint to run and save to disk url = "#{APP_CONFIG['os_server_host_url']}/data_points/#{@data_point.id}.json" @sim_logger.info "Downloading datapoint from #{url}" - r = RestClient.get url + post_count = 0 + post_count_max = 5 + begin + post_count += 1 + r = RestClient.get url + rescue => e + retry if post_count <= post_count_max + raise "RestClient.get url failed with error #{e.message}" + end raise 'Datapoint JSON could not be downloaded' unless r.code == 200 # Parse to JSON to save it again with nice formatting File.open("#{simulation_dir}/data_point.json", 'w') { |f| f << JSON.pretty_generate(JSON.parse(r)) } From 7d76d5da1be939da3ce7e397a1b2ca9477ab5b7e Mon Sep 17 00:00:00 2001 From: bball Date: Tue, 13 Nov 2018 10:32:24 -0700 Subject: [PATCH 2/2] better sleep and more retry --- server/app/jobs/dj_jobs/run_simulate_data_point.rb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/server/app/jobs/dj_jobs/run_simulate_data_point.rb b/server/app/jobs/dj_jobs/run_simulate_data_point.rb index 71c67d85a..733b71307 100644 --- a/server/app/jobs/dj_jobs/run_simulate_data_point.rb +++ b/server/app/jobs/dj_jobs/run_simulate_data_point.rb @@ -123,24 +123,30 @@ def perform # delete any existing data files from the server in case this is a 'rerun' @sim_logger.info "RestClient delete" post_count = 0 - post_count_max = 5 + post_count_max = 50 begin post_count += 1 + @sim_logger.info "delete post_count = #{post_count}" RestClient.delete "#{APP_CONFIG['os_server_host_url']}/data_points/#{@data_point.id}/result_files" rescue => e + sleep Random.new.rand(1.0..10.0) retry if post_count <= post_count_max + @sim_logger.error "RestClient.delete failed with error #{e.message}" raise "RestClient.delete failed with error #{e.message}" end # Download the datapoint to run and save to disk url = "#{APP_CONFIG['os_server_host_url']}/data_points/#{@data_point.id}.json" @sim_logger.info "Downloading datapoint from #{url}" post_count = 0 - post_count_max = 5 + post_count_max = 50 begin post_count += 1 + @sim_logger.info "get url post_count = #{post_count}" r = RestClient.get url rescue => e + sleep Random.new.rand(1.0..10.0) retry if post_count <= post_count_max + @sim_logger.error "RestClient.get url failed with error #{e.message}" raise "RestClient.get url failed with error #{e.message}" end raise 'Datapoint JSON could not be downloaded' unless r.code == 200