Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow overriding of max_in_flight values when resolving deployment problems #2452

Merged
merged 2 commits into from
Jul 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/bosh-director/lib/bosh/director.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ module Director
require 'bosh/director/formatter_helper'
require 'bosh/director/tagged_logger'
require 'bosh/director/duplicate_detector'
require 'bosh/director/numerical_value_calculator'

require 'bosh/director/version'
require 'bosh/director/config'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,15 @@ def initialize(config)

put '/:deployment/problems', consumes: [:json] do
payload = json_decode(request.body.read)
start_task { @problem_manager.apply_resolutions(current_user, deployment, payload['resolutions']) }

start_task do
@problem_manager.apply_resolutions(
current_user,
deployment,
payload['resolutions'],
payload['max_in_flight_overrides'] || {},
)
end
end

put '/:deployment/scan_and_fix', consumes: :json do
Expand Down
4 changes: 2 additions & 2 deletions src/bosh-director/lib/bosh/director/api/problem_manager.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ def get_problems(deployment)
Models::DeploymentProblem.filter(filters).order(:created_at).all
end

def apply_resolutions(username, deployment, resolutions)
JobQueue.new.enqueue(username, Jobs::CloudCheck::ApplyResolutions, 'apply resolutions', [deployment.name, resolutions], deployment)
def apply_resolutions(username, deployment, resolutions, max_in_flight_overrides)
JobQueue.new.enqueue(username, Jobs::CloudCheck::ApplyResolutions, 'apply resolutions', [deployment.name, resolutions, max_in_flight_overrides], deployment)
end

def scan_and_fix(username, deployment, jobs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,11 @@ def to_hash
end

def canaries(size)
get_numerical_value(@canaries_before_calculation, size)
NumericalValueCalculator.get_numerical_value(@canaries_before_calculation, size)
end

def max_in_flight(size)
value = get_numerical_value(@max_in_flight_before_calculation, size)
value = NumericalValueCalculator.get_numerical_value(@max_in_flight_before_calculation, size)
value < 1 ? 1 : value
end

Expand Down Expand Up @@ -149,19 +149,6 @@ def serial?
def update_azs_in_parallel_on_initial_deploy?
@initial_deploy_az_update_strategy == PARALLEL_AZ_UPDATE_STRATEGY
end

private

def get_numerical_value(value, size)
case value
when /^\d+%$/
[((/\d+/.match(value)[0].to_i * size) / 100).round, size].min
when /\A[-+]?[0-9]+\z/
value.to_i
else
raise 'cannot be calculated'
end
end
end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ def self.job_type

# @param [String] deployment_name Deployment name
# @param [Hash] resolutions Problem resolutions
def initialize(deployment_name, resolutions)
# @param [Hash] max_in_flight_overrides Instance Group max_in_flight overrides
def initialize(deployment_name, resolutions, max_in_flight_overrides)
@deployment_manager = Api::DeploymentManager.new
@deployment = @deployment_manager.find_by_name(deployment_name)

Expand All @@ -28,12 +29,19 @@ def initialize(deployment_name, resolutions)
hash
}

# Normalizing max_in_flight
@max_in_flight_overrides =
max_in_flight_overrides.inject({}) { |hash, (instance_group, override)|
hash[instance_group] = override.to_s
hash
}

@problem_resolver = ProblemResolver.new(@deployment)
end

def perform
with_deployment_lock(@deployment) do
count, error_message = @problem_resolver.apply_resolutions(@resolutions)
count, error_message = @problem_resolver.apply_resolutions(@resolutions, @max_in_flight_overrides)

if error_message
raise Bosh::Director::ProblemHandlerError, error_message
Expand Down
14 changes: 14 additions & 0 deletions src/bosh-director/lib/bosh/director/numerical_value_calculator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
module Bosh::Director
class NumericalValueCalculator
def self.get_numerical_value(value, size)
case value
when /^\d+%$/
[((/\d+/.match(value)[0].to_i * size) / 100).round, size].min
when /\A[-+]?[0-9]+\z/
value.to_i
else
raise 'cannot be calculated'
end
end
end
end
19 changes: 12 additions & 7 deletions src/bosh-director/lib/bosh/director/problem_resolver.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@ def track_and_log(task, log = true)
end
end

def apply_resolutions(resolutions)
def apply_resolutions(resolutions, max_in_flight_overrides={})
@resolutions = resolutions
all_problems = Models::DeploymentProblem.where(id: resolutions.keys, deployment_id: @deployment.id).all
begin_stage('Applying problem resolutions', all_problems.size)

if Config.parallel_problem_resolution && all_problems.size > 1
igs_to_problems = problems_by_instance_group(all_problems)
partitions_to_problem_igs = partition_instance_groups_by_serial(igs_to_problems.keys)
process_partitions(partitions_to_problem_igs, igs_to_problems)
process_partitions(partitions_to_problem_igs, igs_to_problems, max_in_flight_overrides)
else
all_problems.each do |problem|
process_problem(problem)
Expand Down Expand Up @@ -82,23 +82,28 @@ def partition_instance_groups_by_serial(all_igs_with_problems)
]
end

def process_partitions(ig_partition_to_ig_names_with_problems, igs_to_problems)
def process_partitions(ig_partition_to_ig_names_with_problems, igs_to_problems, igs_to_max_in_flight)
ig_partition_to_ig_names_with_problems.each do |ig_partition, igs_with_problems|
num_threads = ig_partition.first.update.serial? ? 1 : igs_with_problems.size

parallel_each(num_threads, igs_with_problems) do |ig_name|
process_instance_group(ig_name, igs_to_problems[ig_name])
process_instance_group(ig_name, igs_to_problems[ig_name], igs_to_max_in_flight[ig_name])
end
end
end

def process_instance_group(ig_name, problems)
def process_instance_group(ig_name, problems, max_in_flight_override)
instance_group = @instance_groups.find do |plan_ig|
plan_ig.name == ig_name
end
max_in_flight = instance_group.update.max_in_flight(problems.size)

num_threads = [problems.size, max_in_flight].min
if max_in_flight_override
numerical_override_value = NumericalValueCalculator.get_numerical_value(max_in_flight_override, problems.size)
num_threads = [problems.size, numerical_override_value].min
else
max_in_flight = instance_group.update.max_in_flight(problems.size)
num_threads = [problems.size, max_in_flight].min
end
parallel_each(num_threads, problems) do |problem|
process_problem(problem)
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1930,13 +1930,52 @@ def manifest_with_errand(deployment_name='errand')
post '/mycloud/scans'
expect_redirect_to_queued_task(last_response)

put '/mycloud/problems', JSON.generate('solutions' => { 42 => 'do_this', 43 => 'do_that', 44 => nil }), { 'CONTENT_TYPE' => 'application/json' }
put '/mycloud/problems', JSON.generate(
'resolutions' => { 42 => 'do_this', 43 => 'do_that', 44 => nil },
'max_in_flight_overrides' => {'diego_cell' => '3', 'router' => '50%'},
), { 'CONTENT_TYPE' => 'application/json' }
expect_redirect_to_queued_task(last_response)

put '/mycloud/problems', JSON.generate('solution' => 'default'), { 'CONTENT_TYPE' => 'application/json' }
expect_redirect_to_queued_task(last_response)
end

context 'fixing problems' do
let(:resolutions) do
{ '1' => 'foo', '4' => 'bar' }
end

let(:max_in_flight_overrides) do
{ 'router' => '3', 'diego_cell' => '50%' }
end

before do
expect_any_instance_of(ProblemManager)
.to receive(:apply_resolutions)
.with(anything, deployment, resolutions, max_in_flight_overrides)
.and_return(Models::Task.make)
end

it 'should pass resolutions and overrides to the problem manager' do
put '/mycloud/problems', JSON.generate(
'resolutions' => resolutions,
'max_in_flight_overrides' => max_in_flight_overrides,
), { 'CONTENT_TYPE' => 'application/json' }
expect_redirect_to_queued_task(last_response)
end

context 'when no max_in_flight_overrides are provided' do
let!(:max_in_flight_overrides) { {} }

it 'should pass an empty hash for overrides to the problem manager' do
put '/mycloud/problems', JSON.generate(
'resolutions' => resolutions,
), { 'CONTENT_TYPE' => 'application/json' }
expect_redirect_to_queued_task(last_response)
end
end
end

context 'listing problems' do
let(:problems) do
[
Expand Down
5 changes: 3 additions & 2 deletions src/bosh-director/spec/unit/api/problem_manager_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,13 @@ module Bosh::Director

describe '#apply_resolutions' do
let(:resolutions) { double('Resolutions') }
let(:max_in_flight_overrides) { double('Max_in_flight_overrides') }

it 'enqueues a task' do
expect(job_queue).to receive(:enqueue).with(
username, Jobs::CloudCheck::ApplyResolutions, 'apply resolutions',
[deployment.name, resolutions], deployment).and_return(task)
expect(subject.apply_resolutions(username, deployment, resolutions)).to eq(task)
[deployment.name, resolutions, max_in_flight_overrides], deployment).and_return(task)
expect(subject.apply_resolutions(username, deployment, resolutions, max_in_flight_overrides)).to eq(task)
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,25 @@ module Bosh::Director
let(:resolutions) do
{ 1 => 'delete_disk', 2 => 'ignore' }
end
let(:max_in_flight_overrides) do
{ 'diego_cell' => '100%', 'router' => 5 }
end
let(:normalized_max_in_flight_overrides) do
{ 'diego_cell' => '100%', 'router' => '5' }
end
let(:normalized_resolutions) do
{ '1' => 'delete_disk', '2' => 'ignore' }
end
let(:job) { described_class.new('deployment', resolutions) }
let(:job) { described_class.new('deployment', resolutions, max_in_flight_overrides) }
let(:resolver) { instance_double('Bosh::Director::ProblemResolver') }
let(:deployment) { Models::Deployment.first }

describe '#perform' do
context 'when resolution succeeds' do
it 'should normalize the problem ids' do
it 'should normalize the problem ids and overrides' do
allow(job).to receive(:with_deployment_lock).and_yield

expect(resolver).to receive(:apply_resolutions).with(normalized_resolutions)
expect(resolver).to receive(:apply_resolutions).with(normalized_resolutions, normalized_max_in_flight_overrides)

job.perform
end
Expand Down
25 changes: 25 additions & 0 deletions src/bosh-director/spec/unit/numerical_value_calculator_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require 'spec_helper'

module Bosh::Director
describe NumericalValueCalculator do
describe '#get_numerical_value' do
context 'when value is a string representation of an integer' do
it 'returns the integer value' do
expect(NumericalValueCalculator.get_numerical_value('10', 10)).to eq(10)
end
end

context 'when value is a percentage' do
it 'returns that percentage of size' do
expect(NumericalValueCalculator.get_numerical_value('33%', 10)).to eq(3)
end
end

context 'when value is not the right format' do
it 'raises' do
expect { NumericalValueCalculator.get_numerical_value('foobar', 10) }.to raise_error(/cannot be calculated/)
end
end
end
end
end
39 changes: 37 additions & 2 deletions src/bosh-director/spec/unit/problem_resolver_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def inactive_disk(id, deployment_id = nil)
state: 'open')
end

def test_instance_apply_resolutions
def test_instance_apply_resolutions(max_in_flight_overrides={})
problem_resolutions = {}
(1..num_problem_instance_groups).each do |n|
problems_per_instance_group.times do
Expand All @@ -70,7 +70,7 @@ def test_instance_apply_resolutions
resolver = make_resolver(@deployment)
allow_any_instance_of(ProblemHandlers::MissingVM).to receive(:apply_resolution).with('recreate_vm')

expect(resolver.apply_resolutions(problem_resolutions)).to eq([problem_resolutions.size, nil])
expect(resolver.apply_resolutions(problem_resolutions, max_in_flight_overrides)).to eq([problem_resolutions.size, nil])
expect(Models::DeploymentProblem.filter(state: 'open').count).to eq(0)
end

Expand Down Expand Up @@ -148,6 +148,25 @@ def test_instance_apply_resolutions
).times.with(max_threads: problems_per_instance_group)
end
end

context 'when max_in_flight_overrides are provided' do
let(:num_problem_instance_groups) { 3 }
let(:problems_per_instance_group) { 10 }
let(:max_in_flight_overrides) do
{
'ig-2' => '2',
'ig-3' => '40%',
}
end

it 'overrides max_in_flight for the instance groups overridden' do
test_instance_apply_resolutions(max_in_flight_overrides)
expect(ThreadPool).to have_received(:new).once.with(max_threads: num_problem_instance_groups)
expect(ThreadPool).to have_received(:new).once.with(max_threads: max_in_flight)
expect(ThreadPool).to have_received(:new).once.with(max_threads: 2)
expect(ThreadPool).to have_received(:new).once.with(max_threads: 4)
end
end
end

context 'when parallel resurrection is turned off' do
Expand All @@ -157,6 +176,22 @@ def test_instance_apply_resolutions
test_instance_apply_resolutions
expect(ThreadPool).not_to have_received(:new)
end

context 'when max_in_flight_overrides are provided' do
let(:num_problem_instance_groups) { 3 }
let(:problems_per_instance_group) { 10 }
let(:max_in_flight_overrides) do
{
'ig-2' => '2',
'ig-3' => '40%',
}
end

it 'does not use them' do
test_instance_apply_resolutions(max_in_flight_overrides)
expect(ThreadPool).not_to have_received(:new)
end
end
end

context 'when instance group contains disk problems' do
Expand Down