Skip to content

Commit

Permalink
performance test to compare allocators
Browse files Browse the repository at this point in the history
The current results with 100 workers and 15k services on an n2-standard-48 machine (48 vCPU, 192 GB RAM) are:
Old allocator:

perf_test.go:139: [RESULT] Duration 1m9.646167533s: [quantile:0.5  value:0.462886801 quantile:0.9  value:0.496662838 quantile:0.99  value:0.725845905]

New allocator:
perf_test.go:139: [RESULT] Duration 2m12.900694343s: [quantile:0.5  value:0.481814448 quantile:0.9  value:1.3867615469999999 quantile:0.99  value:1.888190671]

The new allocator has higher latency but, in contrast, allows using a
larger number of Services: when tested with 65k Services, the old
allocator's etcd crashes with "storage exceeded".

The scenario is also not realistic, as a continuous and high load on
Service creation is not expected.
  • Loading branch information
aojea committed Jun 27, 2024
1 parent 6639411 commit 8490273
Showing 1 changed file with 140 additions and 0 deletions.
140 changes: 140 additions & 0 deletions test/integration/servicecidr/perf_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package servicecidr

import (
"context"
"fmt"
"testing"
"time"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
clientset "k8s.io/client-go/kubernetes"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/component-base/metrics/testutil"
"k8s.io/kubernetes/cmd/kube-apiserver/app/options"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/test/integration/framework"
"k8s.io/kubernetes/test/utils/ktesting"
netutils "k8s.io/utils/net"
)

// TestServiceAllocPerformance measures the latency to create N services with a parallelism of K
// using the old and the new ClusterIP allocators (selected via the MultiCIDRServiceAllocator gate).
// The test is skipped on CI and is left to execute manually to check for possible regressions.
// The current results with 100 workers and 15k services on an n2-standard-48 (48 vCPU, 192 GB RAM) are:
// legacy perf_test.go:139: [RESULT] Duration 1m9.646167533s: [quantile:0.5 value:0.462886801 quantile:0.9 value:0.496662838 quantile:0.99 value:0.725845905]
// new perf_test.go:139: [RESULT] Duration 2m12.900694343s: [quantile:0.5 value:0.481814448 quantile:0.9 value:1.3867615469999999 quantile:0.99 value:1.888190671]
func TestServiceAllocPerformance(t *testing.T) {
	t.Skip("KEP-1880 performance comparison")

	// Histogram of per-Service creation latency; registered once and
	// reset between the two subtests so each gate value is measured
	// independently.
	serviceCreation := metrics.NewHistogram(&metrics.HistogramOpts{
		Name:    "service_duration_seconds",
		Help:    "A summary of the Service creation durations in seconds.",
		Buckets: metrics.DefBuckets,
	})
	legacyregistry.MustRegister(serviceCreation)

	// svc builds a minimal ClusterIP Service with a name unique per
	// (worker, job) pair so creations never collide.
	svc := func(i, j int) *v1.Service {
		return &v1.Service{
			ObjectMeta: metav1.ObjectMeta{
				Name: fmt.Sprintf("svc-%v-%v", i, j),
			},
			Spec: v1.ServiceSpec{
				Type: v1.ServiceTypeClusterIP,
				Ports: []v1.ServicePort{
					{Port: 80},
				},
			},
		}
	}

	// worker consumes job numbers from jobs, creates one Service per job,
	// records the observed latency, and forwards the creation error (nil on
	// success) to results. Errors are reported by the results-drain loop in
	// the subtest below, not here, so failures are attributed to the right
	// subtest and not double-counted.
	worker := func(client clientset.Interface, id int, jobs <-chan int, results chan<- error) {
		for j := range jobs {
			t.Logf("Worker: %d Job: %d", id, j)
			// Anonymous func so the deferred latency observation and the
			// context cancel fire per job, not at worker exit.
			func() {
				now := time.Now()
				defer func() {
					t.Logf("worker %d job %d took %v", id, j, time.Since(now))
					serviceCreation.Observe(time.Since(now).Seconds())
				}()
				ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
				defer cancel()
				_, err := client.CoreV1().Services(metav1.NamespaceDefault).Create(ctx, svc(id, j), metav1.CreateOptions{})
				results <- err
			}()
		}
	}

	for _, gate := range []bool{false, true} {
		t.Run(fmt.Sprintf("feature-gate=%v", gate), func(t *testing.T) {
			featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.MultiCIDRServiceAllocator, gate)

			tCtx := ktesting.Init(t)
			client, _, tearDownFn := framework.StartTestServer(tCtx, t, framework.TestServerSetup{
				ModifyServerRunOptions: func(opts *options.ServerRunOptions) {
					// Use the largest range possible; this is limited by the old allocator.
					opts.ServiceClusterIPRanges = "10.0.0.0/12"
					opts.GenericServerRunOptions.AdvertiseAddress = netutils.ParseIPSloppy("10.0.0.1")
					opts.APIEnablement.RuntimeConfig.Set("networking.k8s.io/v1alpha1=true") // nolint: errcheck
				},
			})
			defer tearDownFn()

			// Clear any observations left over from the previous subtest.
			legacyregistry.Reset()

			// 100 workers for 15k services.
			nworkers := 100
			nservices := 15000
			jobs := make(chan int, nservices)
			results := make(chan error, nservices)
			t.Log("Starting workers to create ClusterIP Service")
			now := time.Now()
			for w := 0; w < nworkers; w++ {
				t.Logf("Starting worker %d", w)
				go worker(client, w, jobs, results)
			}
			for i := 0; i < nservices; i++ {
				t.Logf("Sending job %d", i)
				jobs <- i
			}
			t.Log("All jobs processed")
			// Closing jobs lets the worker goroutines exit once drained.
			close(jobs)

			// Drain exactly one result per job; this also guarantees all
			// workers have finished before the subtest returns.
			for c := 0; c < nservices; c++ {
				t.Logf("Getting results %d", c)
				if err := <-results; err != nil {
					t.Errorf("error creating service: %v", err)
				}
			}

			vec, err := testutil.GetHistogramVecFromGatherer(legacyregistry.DefaultGatherer, serviceCreation.Name, map[string]string{})
			if err != nil {
				// Fatal, not Error: vec is unusable on failure and the
				// method calls below would panic on a nil histogram.
				t.Fatal(err)
			}

			t.Logf("[RESULT] feature-gate=%v Duration: %v Avg: %.4f p95: %.4f p99: %.4f", gate, time.Since(now), vec.Average(), vec.Quantile(0.95), vec.Quantile(0.99))
		})
	}
}

0 comments on commit 8490273

Please sign in to comment.