From 72d0e8171df4f4e1f717ab72ba644d4f96260fba Mon Sep 17 00:00:00 2001 From: Christian Kadner Date: Mon, 6 Mar 2023 14:02:34 -0800 Subject: [PATCH] test: Add FVT tests for PVC storage Related: #230, #267 Signed-off-by: Christian Kadner --- .github/workflows/run-fvt.yml | 2 +- Makefile | 4 +- config/dependencies/fvt.yaml | 127 +++++++++++++ config/dependencies/minio-storage-secret.yaml | 14 +- config/manager/kustomization.yaml | 1 - config/runtimes/ovms-1.x.yaml | 3 + config/runtimes/triton-2.x.yaml | 3 + docs/quickstart.md | 3 +- fvt/README.md | 2 +- fvt/fvtclient.go | 25 ++- fvt/helpers.go | 115 +++++++++--- fvt/predictor/isvc_test.go | 2 +- fvt/storage/storage_suite_test.go | 123 +++++++++++++ fvt/storage/storage_test.go | 171 ++++++++++++++++++ fvt/testdata/isvcs/isvc-pvc-2.yaml | 12 ++ fvt/testdata/isvcs/isvc-pvc-3.yaml | 14 ++ fvt/testdata/isvcs/isvc-pvc-4.yaml | 14 ++ fvt/testdata/isvcs/isvc-pvc-path.yaml | 16 ++ fvt/testdata/isvcs/isvc-pvc-uri.yaml | 12 ++ scripts/install.sh | 16 +- 20 files changed, 630 insertions(+), 49 deletions(-) create mode 100644 fvt/storage/storage_suite_test.go create mode 100644 fvt/storage/storage_test.go create mode 100644 fvt/testdata/isvcs/isvc-pvc-2.yaml create mode 100644 fvt/testdata/isvcs/isvc-pvc-3.yaml create mode 100644 fvt/testdata/isvcs/isvc-pvc-4.yaml create mode 100644 fvt/testdata/isvcs/isvc-pvc-path.yaml create mode 100644 fvt/testdata/isvcs/isvc-pvc-uri.yaml diff --git a/.github/workflows/run-fvt.yml b/.github/workflows/run-fvt.yml index 65f754bbf..397aebe73 100644 --- a/.github/workflows/run-fvt.yml +++ b/.github/workflows/run-fvt.yml @@ -79,7 +79,7 @@ jobs: run: | docker images kubectl get pods - kubectl get servingruntimes + kubectl get clusterservingruntimes - name: Run FVTs run: | go install github.com/onsi/ginkgo/v2/ginkgo diff --git a/Makefile b/Makefile index 29027257f..7274e4779 100644 --- a/Makefile +++ b/Makefile @@ -46,8 +46,10 @@ test: go test -coverprofile cover.out `go list ./... 
| grep -v fvt` # Run fvt tests. This requires an etcd, kubernetes connection, and model serving installation. Ginkgo CLI is used to run them in parallel +# TODO: reinstate all FVT suites fvt: - ginkgo -v -p -progress --fail-fast fvt/predictor fvt/scaleToZero --timeout=40m + @#ginkgo -v -p --fail-fast fvt/predictor fvt/scaleToZero fvt/storage --timeout=40m + ginkgo --fail-fast fvt/storage --timeout=40m --succinct # Command to regenerate the grpc go files from the proto files fvt-protoc: diff --git a/config/dependencies/fvt.yaml b/config/dependencies/fvt.yaml index 4ae036424..816309d51 100644 --- a/config/dependencies/fvt.yaml +++ b/config/dependencies/fvt.yaml @@ -123,3 +123,130 @@ stringData: "default_bucket": "modelmesh-example-models", "region": "us-south" } + pvc1: | + { + "type": "pvc", + "name": "models-pvc-1" + } + pvc2: | + { + "type": "pvc", + "name": "models-pvc-2" + } +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: "models-pvc-1" +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: "models-pvc-2" +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: "models-pvc-3" +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: "pvc-init" +spec: + template: + metadata: + name: "pvc-init-pod" + spec: + restartPolicy: OnFailure + containers: + - name: "copy-pod" + image: kserve/modelmesh-minio-examples:latest + securityContext: + runAsUser: 0 + allowPrivilegeEscalation: false + command: ["/bin/sh", "-ex", "-c"] + args: + - echo copy model files ...; + whoami; + ls -al "${SRC_FOLDER}"; + cp -r "${SRC_FOLDER}"/* "${DST_FOLDER_1}" && + cp -r "${SRC_FOLDER}"/* "${DST_FOLDER_2}" && + cp -r "${SRC_FOLDER}"/* "${DST_FOLDER_3}" && + ls -al "${DST_FOLDER_1}" && + ls -al "${DST_FOLDER_2}" 
&& + ls -al "${DST_FOLDER_3}" && + echo done && + exit 0; + env: + - name: SRC_FOLDER + value: "/data1/modelmesh-example-models" + - name: DST_FOLDER_1 + value: "/mnt/pvc1" + - name: DST_FOLDER_2 + value: "/mnt/pvc2" + - name: DST_FOLDER_3 + value: "/mnt/pvc3" + volumeMounts: + - name: "pvc1" + mountPath: "/mnt/pvc1" + - name: "pvc2" + mountPath: "/mnt/pvc2" + - name: "pvc3" + mountPath: "/mnt/pvc3" + volumes: + - name: "pvc1" + persistentVolumeClaim: + claimName: "models-pvc-1" + - name: "pvc2" + persistentVolumeClaim: + claimName: "models-pvc-2" + - name: "pvc3" + persistentVolumeClaim: + claimName: "models-pvc-3" + backoffLimit: 4 +--- +apiVersion: v1 +kind: Pod +metadata: + name: "pvc-reader" +spec: + containers: + - name: main + image: ubuntu + command: ["/bin/sh", "-ec", "sleep 10000"] + volumeMounts: + - name: "pvc1" + mountPath: "/mnt/pvc1" + - name: "pvc2" + mountPath: "/mnt/pvc2" + - name: "pvc3" + mountPath: "/mnt/pvc3" + volumes: + - name: "pvc1" + persistentVolumeClaim: + claimName: "models-pvc-1" + - name: "pvc2" + persistentVolumeClaim: + claimName: "models-pvc-2" + - name: "pvc3" + persistentVolumeClaim: + claimName: "models-pvc-3" diff --git a/config/dependencies/minio-storage-secret.yaml b/config/dependencies/minio-storage-secret.yaml index 5140a8113..1c456d353 100644 --- a/config/dependencies/minio-storage-secret.yaml +++ b/config/dependencies/minio-storage-secret.yaml @@ -1,5 +1,17 @@ +# Copyright 2021 IBM Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: v1 - kind: Secret metadata: name: storage-config diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 55e768ba0..79624fcc7 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -17,5 +17,4 @@ resources: images: - name: modelmesh-controller newName: kserve/modelmesh-controller - ## NOTE THIS SHOULD BE REPLACED WITH LATEST CONTROLLER IMAGE TAG newTag: latest diff --git a/config/runtimes/ovms-1.x.yaml b/config/runtimes/ovms-1.x.yaml index e55350958..541788bb3 100644 --- a/config/runtimes/ovms-1.x.yaml +++ b/config/runtimes/ovms-1.x.yaml @@ -57,3 +57,6 @@ spec: runtimeManagementPort: 8888 memBufferBytes: 134217728 modelLoadingTimeoutMillis: 90000 + + # TODO: re-enable OpenVino runtime + disabled: true diff --git a/config/runtimes/triton-2.x.yaml b/config/runtimes/triton-2.x.yaml index 9b1cd6140..7f9d85a88 100644 --- a/config/runtimes/triton-2.x.yaml +++ b/config/runtimes/triton-2.x.yaml @@ -93,3 +93,6 @@ spec: runtimeManagementPort: 8001 memBufferBytes: 134217728 modelLoadingTimeoutMillis: 90000 + + # TODO: re-enable Triton runtime + disabled: true diff --git a/docs/quickstart.md b/docs/quickstart.md index a75f70e94..7eb524366 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -270,7 +270,8 @@ To see more detailed instructions and information, click [here](./predictors/run ## 4. 
(Optional) Deleting your ModelMesh Serving installation -To delete all ModelMesh Serving resources that were installed, run the following from the root of the project: +To delete all ModelMesh Serving resources that were installed, run the following +command from the root of the project: ```shell ./scripts/delete.sh --namespace modelmesh-serving diff --git a/fvt/README.md b/fvt/README.md index c3c312f17..ef5572435 100644 --- a/fvt/README.md +++ b/fvt/README.md @@ -33,7 +33,7 @@ If starting with a fresh namespace, install ModelMesh Serving configured for the ./scripts/install.sh --namespace modelmesh-serving --fvt --dev-mode-logging ``` -To re-configure an existing quick-start instance for FVTs, run: +To re-configure an existing "quickstart" deployment for FVTs, run: ```Shell kubectl apply -f config/dependencies/fvt.yaml diff --git a/fvt/fvtclient.go b/fvt/fvtclient.go index e6d1b5b73..0d2b4426b 100644 --- a/fvt/fvtclient.go +++ b/fvt/fvtclient.go @@ -57,8 +57,8 @@ import ( torchserveapi "github.com/kserve/modelmesh-serving/fvt/generated/torchserve/apis" ) -const predictorTimeout = time.Second * 120 -const timeForStatusToStabilize = time.Second * 5 +const PredictorTimeout = time.Second * 180 +const timeForStatusToStabilize = time.Second * 60 type ModelServingConnectionType int @@ -374,6 +374,13 @@ func (fvt *FVTClient) PrintIsvcs() { } } +func (fvt *FVTClient) PrintDescribeIsvc(name string) { + err := fvt.RunKubectl("describe", "isvc", name) + if err != nil { + fvt.log.Error(err, fmt.Sprintf("Error running describe isvc '%s' command", name)) + } +} + func (fvt *FVTClient) PrintPods() { err := fvt.RunKubectl("get", "pods") if err != nil { @@ -412,12 +419,12 @@ func (fvt *FVTClient) TailPodLogs(sinceTime string) { func (fvt *FVTClient) RunKubectl(args ...string) error { args = append(args, "-n", fvt.namespace) - getPredictorCommand := exec.Command("kubectl", args...) 
- getPredictorCommand.Stdout = ginkgo.GinkgoWriter - getPredictorCommand.Stderr = ginkgo.GinkgoWriter - fvt.log.Info("Running command", "args", strings.Join(getPredictorCommand.Args, " ")) + kubectlCmd := exec.Command("kubectl", args...) + kubectlCmd.Stdout = ginkgo.GinkgoWriter + kubectlCmd.Stderr = ginkgo.GinkgoWriter + fvt.log.Info("Running command", "args", strings.Join(kubectlCmd.Args, " ")) fmt.Fprintf(ginkgo.GinkgoWriter, "=====================================================================================================================================\n") - err := getPredictorCommand.Run() + err := kubectlCmd.Run() fmt.Fprintf(ginkgo.GinkgoWriter, "=====================================================================================================================================\n") return err } @@ -504,11 +511,11 @@ func (fvt *FVTClient) ConnectToModelServing(connectionType ModelServingConnectio } if err := fvt.grpcPortForward.EnsureStarted(); err != nil { - return fmt.Errorf("Error with grpc port-forward, could not connect to model serving") + return fmt.Errorf("Error with gRPC port-forward, could not connect to model serving") } if err := fvt.restPortForward.EnsureStarted(); err != nil { - return fmt.Errorf("Error with rest port-forward, could not connect to model serving") + return fmt.Errorf("Error with REST port-forward, could not connect to model serving") } ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) diff --git a/fvt/helpers.go b/fvt/helpers.go index e586bc46b..753185469 100644 --- a/fvt/helpers.go +++ b/fvt/helpers.go @@ -4,7 +4,7 @@ // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, @@ -20,6 +20,8 @@ import ( . 
"github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + api "github.com/kserve/modelmesh-serving/apis/serving/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" utilrand "k8s.io/apimachinery/pkg/util/rand" @@ -59,22 +61,22 @@ func CreatePredictorAndWaitAndExpectLoaded(predictorManifest *unstructured.Unstr createdPredictor := FVTClientInstance.CreatePredictorExpectSuccess(predictorManifest) ExpectPredictorState(createdPredictor, false, "Pending", "", "UpToDate") - By("Waiting for predictor" + predictorName + " to be 'Loaded'") + By("Waiting for predictor " + predictorName + " to be 'Loaded'") // TODO: "Standby" (or) "FailedToLoad" states are currently encountered after the "Loading" state but they shouldn't be (see issue#994) resultingPredictor := WaitForLastStateInExpectedList("activeModelState", []string{"Pending", "Loading", "Standby", "FailedToLoad", "Loading", "Loaded"}, watcher) ExpectPredictorState(resultingPredictor, true, "Loaded", "", "UpToDate") return resultingPredictor } -func CreateIsvcAndWaitAndExpectReady(isvcManifest *unstructured.Unstructured) *unstructured.Unstructured { +func CreateIsvcAndWaitAndExpectReady(isvcManifest *unstructured.Unstructured, timeout time.Duration) *unstructured.Unstructured { isvcName := isvcManifest.GetName() By("Creating inference service " + isvcName) - watcher := FVTClientInstance.StartWatchingIsvcs(metav1.ListOptions{FieldSelector: "metadata.name=" + isvcName}, DefaultTimeout) + watcher := FVTClientInstance.StartWatchingIsvcs(metav1.ListOptions{FieldSelector: "metadata.name=" + isvcName}, int64(timeout.Seconds())) defer watcher.Stop() FVTClientInstance.CreateIsvcExpectSuccess(isvcManifest) - By("Waiting for inference service" + isvcName + " to be 'Ready'") + By("Waiting for inference service " + isvcName + " to be 'Ready' and model is 'Loaded'") // ISVC does not have the status field set initially. 
- resultingIsvc := WaitForIsvcReady(watcher) + resultingIsvc := WaitForIsvcState(watcher, api.Loaded, isvcName, timeout) return resultingIsvc } @@ -87,13 +89,25 @@ func CreatePredictorAndWaitAndExpectFailed(predictorManifest *unstructured.Unstr createdPredictor := FVTClientInstance.CreatePredictorExpectSuccess(predictorManifest) ExpectPredictorState(createdPredictor, false, "Pending", "", "UpToDate") - By("Waiting for predictor" + predictorName + " to be 'FailedToLoaded'") - // "Standby" state is encountered after the "Loading" state but it shouldn't be + By("Waiting for predictor " + predictorName + " to have 'FailedToLoad'") + // "Standby" state is encountered after the "Loading" state, but it shouldn't be resultingPredictor := WaitForLastStateInExpectedList("activeModelState", []string{"Pending", "Loading", "Standby", "Loading", "FailedToLoad"}, watcher) ExpectPredictorState(resultingPredictor, false, "FailedToLoad", "", "UpToDate") return resultingPredictor } +func CreateIsvcAndWaitAndExpectFailed(isvcManifest *unstructured.Unstructured) *unstructured.Unstructured { + isvcName := isvcManifest.GetName() + By("Creating inference service " + isvcName) + watcher := FVTClientInstance.StartWatchingIsvcs(metav1.ListOptions{FieldSelector: "metadata.name=" + isvcName}, DefaultTimeout) + defer watcher.Stop() + FVTClientInstance.CreateIsvcExpectSuccess(isvcManifest) + By("Waiting for inference service " + isvcName + " to fail") + // ISVC does not have the status field set initially. + resultingIsvc := WaitForIsvcState(watcher, api.FailedToLoad, isvcName, PredictorTimeout) + return resultingIsvc +} + func CreatePredictorAndWaitAndExpectInvalidSpec(predictorManifest *unstructured.Unstructured) *unstructured.Unstructured { predictorName := predictorManifest.GetName() @@ -103,7 +117,7 @@ func CreatePredictorAndWaitAndExpectInvalidSpec(predictorManifest *unstructured. 
createdPredictor := FVTClientInstance.CreatePredictorExpectSuccess(predictorManifest) ExpectPredictorState(createdPredictor, false, "Pending", "", "UpToDate") - By("Waiting for predictor" + predictorName + " to have transitionStatus 'InvalidSpec'") + By("Waiting for predictor " + predictorName + " to have transitionStatus 'InvalidSpec'") return WaitForLastStateInExpectedList("transitionStatus", []string{"UpToDate", "InvalidSpec"}, watcher) } @@ -174,7 +188,7 @@ func ExpectPredictorFailureInfo(obj *unstructured.Unstructured, reason string, h Expect(actualFailureInfo["location"]).To(BeNil()) } if message != "" { - Expect(actualFailureInfo["message"]).To(Equal(message)) + Expect(actualFailureInfo["message"]).To(ContainSubstring(message)) } else { Expect(actualFailureInfo["message"]).ToNot(BeEmpty()) } @@ -188,47 +202,90 @@ func ExpectPredictorFailureInfo(obj *unstructured.Unstructured, reason string, h } } -func WaitForIsvcReady(watcher watch.Interface) *unstructured.Unstructured { +func ExpectIsvcState(obj *unstructured.Unstructured, activeModelState, targetModelState, transitionStatus string) { + actualActiveModelState := GetString(obj, "status", "modelStatus", "states", "activeModelState") + Expect(actualActiveModelState).To(Equal(activeModelState)) + + actualTargetModel := GetString(obj, "status", "modelStatus", "states", "targetModelState") + Expect(actualTargetModel).To(Equal(targetModelState)) + + actualTransitionStatus := GetString(obj, "status", "modelStatus", "transitionStatus") + Expect(actualTransitionStatus).To(Equal(transitionStatus)) + + if transitionStatus != "BlockedByFailedLoad" && transitionStatus != "InvalidSpec" && + activeModelState != "FailedToLoad" && targetModelState != "FailedToLoad" { + actualFailureInfo := GetMap(obj, "status", "modelStatus", "lastFailureInfo") + Expect(actualFailureInfo).To(BeNil()) + } +} + +func ExpectIsvcFailureInfo(obj *unstructured.Unstructured, reason string, hasLocation bool, hasTime bool, message string) { + 
actualFailureInfo := GetMap(obj, "status", "modelStatus", "lastFailureInfo") + Expect(actualFailureInfo).ToNot(BeNil()) + + Expect(actualFailureInfo["reason"]).To(Equal(reason)) + if hasLocation { + Expect(actualFailureInfo["location"]).ToNot(BeEmpty()) + } else { + Expect(actualFailureInfo["location"]).To(BeNil()) + } + if message != "" { + Expect(actualFailureInfo["message"]).To(ContainSubstring(message)) + } else { + Expect(actualFailureInfo["message"]).ToNot(BeEmpty()) + } + if !hasTime { + Expect(actualFailureInfo["time"]).To(BeNil()) + } else { + Expect(actualFailureInfo["time"]).ToNot(BeNil()) + actualTime, err := time.Parse(time.RFC3339, actualFailureInfo["time"].(string)) + Expect(err).To(BeNil()) + Expect(time.Since(actualTime) < time.Minute).To(BeTrue()) + } +} + +func WaitForIsvcState(watcher watch.Interface, desiredState api.ModelState, name string, timeout time.Duration) *unstructured.Unstructured { ch := watcher.ResultChan() - isReady := false + reachedDesiredState := false var obj *unstructured.Unstructured - var isvcName string + var isvcName = name - timeout := time.After(predictorTimeout) done := false for !done { select { - // Exit the loop if InferenceService is not ready before given timeout. - case <-timeout: + // exit the loop if InferenceService is not ready before given timeout. 
+ case <-time.After(timeout): done = true + FVTClientInstance.PrintDescribeIsvc(isvcName) case event, ok := <-ch: if !ok { // the channel was closed (watcher timeout reached) done = true + FVTClientInstance.PrintDescribeIsvc(isvcName) break } obj, ok = event.Object.(*unstructured.Unstructured) Expect(ok).To(BeTrue()) isvcName = GetString(obj, "metadata", "name") - conditions, exists := GetSlice(obj, "status", "conditions") + // ISVC does not have the status field set initially + // modelStatus will not exist until status.conditions exist + _, exists := GetSlice(obj, "status", "conditions") if !exists { time.Sleep(time.Second) continue } - for _, condition := range conditions { - conditionMap := condition.(map[string]interface{}) - if conditionMap["type"] == "Ready" { - if conditionMap["status"] == "True" { - isReady = true - done = true - break - } - } + // Note: first status.conditions[{"Type": "Ready", "Status": "True"}] can + // occur before status.conditions[{"Type": "Ready", "Status": "False"}] !!! 
+ activeModelState := GetString(obj, "status", "modelStatus", "states", "activeModelState") + if activeModelState == string(desiredState) { + reachedDesiredState = true + done = true + } else { + time.Sleep(time.Second) } - } } - Expect(isReady).To(BeTrue(), "Timeout before InferenceService '%s' ready", isvcName) + Expect(reachedDesiredState).To(BeTrue(), "Timeout before InferenceService '%s' reached state '%s'", isvcName, desiredState) return obj } @@ -244,7 +301,7 @@ func WaitForLastStateInExpectedList(statusAttribute string, expectedStates []str lastState := "UNSEEN" var predictorName string - timeout := time.After(predictorTimeout) + timeout := time.After(PredictorTimeout) lastStateIndex := 0 done := false for !done { diff --git a/fvt/predictor/isvc_test.go b/fvt/predictor/isvc_test.go index d02254ff8..cc3c31ba5 100644 --- a/fvt/predictor/isvc_test.go +++ b/fvt/predictor/isvc_test.go @@ -44,7 +44,7 @@ var _ = Describe("Inference service", Ordered, func() { It("should successfully load a model", func() { isvcObject := NewIsvcForFVT(i.inferenceServiceFileName) isvcName = isvcObject.GetName() - CreateIsvcAndWaitAndExpectReady(isvcObject) + CreateIsvcAndWaitAndExpectReady(isvcObject, PredictorTimeout) err := FVTClientInstance.ConnectToModelServing(Insecure) Expect(err).ToNot(HaveOccurred()) diff --git a/fvt/storage/storage_suite_test.go b/fvt/storage/storage_suite_test.go new file mode 100644 index 000000000..3ae67719d --- /dev/null +++ b/fvt/storage/storage_suite_test.go @@ -0,0 +1,123 @@ +// Copyright 2023 IBM Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package storage + +import ( + "os" + "testing" + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + . "github.com/kserve/modelmesh-serving/fvt" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestStorage(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Storage Suite") +} + +func createFVTClient() { + Log = zap.New(zap.UseDevMode(true), zap.WriteTo(GinkgoWriter)) + Log.Info("Initializing test suite") + + namespace := os.Getenv("NAMESPACE") + if namespace == "" { + namespace = DefaultTestNamespace + } + serviceName := os.Getenv("SERVICENAME") + if serviceName == "" { + serviceName = DefaultTestServiceName + } + controllerNamespace := os.Getenv("CONTROLLERNAMESPACE") + if controllerNamespace == "" { + controllerNamespace = DefaultControllerNamespace + } + NameSpaceScopeMode = os.Getenv("NAMESPACESCOPEMODE") == "true" + Log.Info("Using environment variables", "NAMESPACE", namespace, "SERVICENAME", serviceName, + "CONTROLLERNAMESPACE", controllerNamespace, "NAMESPACESCOPEMODE", NameSpaceScopeMode) + + var err error + FVTClientInstance, err = GetFVTClient(Log, namespace, serviceName, controllerNamespace) + Expect(err).ToNot(HaveOccurred()) + Expect(FVTClientInstance).ToNot(BeNil()) + Log.Info("FVTClientInstance created", "client", FVTClientInstance) +} + +var _ = SynchronizedBeforeSuite(func() []byte { + // runs *only* on process #1 + createFVTClient() + + // confirm 4 cluster serving runtimes or serving
runtimes exist + var err error + var list *unstructured.UnstructuredList + if NameSpaceScopeMode { + list, err = FVTClientInstance.ListServingRuntimes(metav1.ListOptions{}) + } else { + list, err = FVTClientInstance.ListClusterServingRuntimes(metav1.ListOptions{}) + } + Expect(err).ToNot(HaveOccurred()) + Expect(list.Items).To(HaveLen(4)) + + FVTClientInstance.SetDefaultUserConfigMap() + + // ensure that there are no predictors to start + FVTClientInstance.DeleteAllPredictors() + FVTClientInstance.DeleteAllIsvcs() + // ensure a stable deploy state + WaitForStableActiveDeployState() + // create TLS secrets before start of tests + FVTClientInstance.CreateTLSSecrets() + return nil +}, func(_ []byte) { + // runs on *all* processes + // create the fvtClient Instance on every other process except the first, since it got created in the above function. + if FVTClientInstance == nil { + createFVTClient() + } + Log.Info("Setup completed") +}) + +var _ = SynchronizedAfterSuite(func() { + // runs on *all* processes + // ensure we clean up any port-forward + FVTClientInstance.DisconnectFromModelServing() +}, func() { + // runs *only* on process #1 + FVTClientInstance.DeleteTLSSecrets() + FVTClientInstance.SetDefaultUserConfigMap() + // restart pods to reset Bootstrap failure checks + FVTClientInstance.RestartDeploys() +}) + +// register handlers for a failed test case to print info to the console +var startTime string +var _ = JustBeforeEach(func() { + startTime = time.Now().Format("2006-01-02T15:04:05Z") +}) +var _ = JustAfterEach(func() { + if CurrentSpecReport().Failed() { + FVTClientInstance.PrintPredictors() + FVTClientInstance.PrintIsvcs() + FVTClientInstance.PrintPods() + FVTClientInstance.PrintDescribeNodes() + FVTClientInstance.PrintEvents() + FVTClientInstance.TailPodLogs(startTime) + } +}) diff --git a/fvt/storage/storage_test.go b/fvt/storage/storage_test.go new file mode 100644 index 000000000..e0d30df92 --- /dev/null +++ b/fvt/storage/storage_test.go @@ -0,0 
+1,171 @@ +// Copyright 2023 IBM Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package storage + +import ( + . "github.com/kserve/modelmesh-serving/fvt" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) + +var isvcFiles = map[string]string{ + "isvc-pvc-storage-uri": "isvc-pvc-uri.yaml", + "isvc-pvc-storage-path": "isvc-pvc-path.yaml", + "isvc-pvc2": "isvc-pvc-2.yaml", + "isvc-pvc3": "isvc-pvc-3.yaml", + "isvc-pvc4": "isvc-pvc-4.yaml", +} + +// ISVCs using PVCs from the FVT `storage-config` Secret (config/dependencies/fvt.yaml) +var isvcWithPvcInStorageConfig = []string{"isvc-pvc-storage-uri", "isvc-pvc-storage-path", "isvc-pvc2"} + +// ISVC using PVC not in the FVT `storage-config` Secret (config/dependencies/fvt.yaml) +// this should work only after setting allowAnyPVC = true +var isvcWithPvcNotInStorageConfig = "isvc-pvc3" + +// ISVC using a PVC that does not exist at all, this ISVC should fail to load +var isvcWithNonExistentPvc = "isvc-pvc4" + +var _ = Describe("ISVCs", Ordered, func() { + + Describe("with PVC in storage-config", Ordered, func() { + + for _, name := range isvcWithPvcInStorageConfig { + + Describe("\""+name+"\"", Ordered, func() { + var isvcName = name + var fileName = isvcFiles[name] + + It("should successfully load a model", func() { + isvcObject := NewIsvcForFVT(fileName) + isvcName = isvcObject.GetName() +
CreateIsvcAndWaitAndExpectReady(isvcObject, PredictorTimeout) + }) + + It("should successfully run inference", func() { + err := FVTClientInstance.ConnectToModelServing(Insecure) + Expect(err).ToNot(HaveOccurred()) + ExpectSuccessfulInference_sklearnMnistSvm(isvcName) + }) + + AfterAll(func() { + FVTClientInstance.DeleteIsvc(isvcName) + FVTClientInstance.DisconnectFromModelServing() + }) + + }) + } + }) + + Describe("with PVC not in storage-config", Ordered, func() { + var isvcObject *unstructured.Unstructured + + It("should fail with PVC not mounted", func() { + isvcObject = NewIsvcForFVT(isvcFiles[isvcWithPvcNotInStorageConfig]) + + obj := CreateIsvcAndWaitAndExpectFailed(isvcObject) + + By("Asserting on the ISVC state") + ExpectIsvcFailureInfo(obj, "ModelLoadFailed", true, true, "") + + FVTClientInstance.DeleteIsvc(isvcObject.GetName()) + }) + + It("should load a model when allowAnyPVC", func() { + // This ISVC needs a new PVC which is not in the storage-config secret. + // The controller will update the deployment with the pvc_mount, but + // if the old runtime pods are still around, the ISVC will get deployed + // on an old runtime pod without the PVC mounted and keep failing + // until a new pod with the PVC is ready and the controller finally + // decides to move the ISVC onto the new pod that has the PVC mounted. + // However, this process can take a long time (how long?) 
so, we take + // some extra measures to increase our chances for quick success: + // - scale to 0 prevents the new ISVC from landing on an old runtime pod + // that does not have the "any" PVC mounted yet + // - use more than 1 pod per runtime so controller will not kill new + // pods that have the PVC mounted but because the ISVC is loaded on + // the old pod (without the PVC) but the old pod gets kept around + // instead of the new because the ISVC is still on there -- even + // though it's failing + // - allowAnyPVC needs rest-proxy enabled (not sure why) + config := map[string]interface{}{ + "allowAnyPVC": true, + "podsPerRuntime": 1, + "scaleToZero": map[string]interface{}{ + "enabled": true, + }, + "restProxy": map[string]interface{}{ + "enabled": true, + }, + } + By("Updating the user config to allow any PVC") + FVTClientInstance.ApplyUserConfigMap(config) + + // after applying configmap, the runtime pod(s) restart, wait for stability + By("Waiting for stable deploy state") + WaitForStableActiveDeployState() + + isvcObject = NewIsvcForFVT(isvcFiles[isvcWithPvcNotInStorageConfig]) + + FVTClientInstance.PrintPods() + + // after mounting the new PVC the runtime pod(s) restart again, but the ISVC, + // if not scaleToZero, could have landed on the previous runtime pod and will + // fail to load the first time, so we extend the standard predictor timeout + extendedTimeout := PredictorTimeout * 2 + obj := CreateIsvcAndWaitAndExpectReady(isvcObject, extendedTimeout) + ExpectIsvcState(obj, "Loaded", "", "UpToDate") + + FVTClientInstance.PrintPods() + + // since the runtime pod(s) restarted twice, but (some of) the old runtime pods + // are lingering around (Terminating) we may have gotten a defunct connection + // after applying configmap, the runtime pod(s) restart, wait for stability + WaitForStableActiveDeployState() + + err := FVTClientInstance.ConnectToModelServing(Insecure) + Expect(err).ToNot(HaveOccurred()) + + isvcName := isvcObject.GetName() +
ExpectSuccessfulInference_sklearnMnistSvm(isvcName) + + FVTClientInstance.DisconnectFromModelServing() + FVTClientInstance.DeleteIsvc(isvcObject.GetName()) + }) + + It("should fail with non-existent PVC", func() { + // make a shallow copy of default configmap (don't modify the DefaultConfig reference) + // keeping 1 pod per runtime and don't scale to 0 + config := make(map[string]interface{}) + for k, v := range DefaultConfig { + config[k] = v + } + // update the model-serving-config to allow any PVC + config["allowAnyPVC"] = true + FVTClientInstance.ApplyUserConfigMap(config) + + By("Waiting for stable deploy state") + WaitForStableActiveDeployState() + + isvcObject = NewIsvcForFVT(isvcFiles[isvcWithNonExistentPvc]) + + obj := CreateIsvcAndWaitAndExpectFailed(isvcObject) + ExpectIsvcFailureInfo(obj, "ModelLoadFailed", true, true, "") + + FVTClientInstance.DeleteIsvc(isvcObject.GetName()) + }) + }) +}) diff --git a/fvt/testdata/isvcs/isvc-pvc-2.yaml b/fvt/testdata/isvcs/isvc-pvc-2.yaml new file mode 100644 index 000000000..a0655632a --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-2.yaml @@ -0,0 +1,12 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc2 + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-2/sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-3.yaml b/fvt/testdata/isvcs/isvc-pvc-3.yaml new file mode 100644 index 000000000..1595ab062 --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-3.yaml @@ -0,0 +1,14 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc3 + labels: + name: isvc-pvc3 + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-3/sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-4.yaml b/fvt/testdata/isvcs/isvc-pvc-4.yaml new file mode 
100644 index 000000000..de186276c --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-4.yaml @@ -0,0 +1,14 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc4 + labels: + name: isvc-pvc4 + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-4/sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-path.yaml b/fvt/testdata/isvcs/isvc-pvc-path.yaml new file mode 100644 index 000000000..71db761c0 --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-path.yaml @@ -0,0 +1,16 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc-storage-path + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storage: + parameters: + type: pvc + name: models-pvc-1 + path: sklearn/mnist-svm.joblib diff --git a/fvt/testdata/isvcs/isvc-pvc-uri.yaml b/fvt/testdata/isvcs/isvc-pvc-uri.yaml new file mode 100644 index 000000000..5334b7032 --- /dev/null +++ b/fvt/testdata/isvcs/isvc-pvc-uri.yaml @@ -0,0 +1,12 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: isvc-pvc-storage-uri + annotations: + serving.kserve.io/deploymentMode: ModelMesh +spec: + predictor: + model: + modelFormat: + name: sklearn + storageUri: pvc://models-pvc-1/sklearn/mnist-svm.joblib diff --git a/scripts/install.sh b/scripts/install.sh index a73ac028f..f4662677c 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -132,7 +132,7 @@ wait_for_pods_ready() { fi wait_counter=$((wait_counter + 1)) - echo " Waiting 10 secs..." + echo " Waiting 10 secs ..." sleep 10 done } @@ -254,7 +254,7 @@ if [[ $quickstart == "true" ]]; then info "Deploying quickstart resources for etcd and minio" kubectl apply -f quickstart.yaml - info "Waiting for dependent pods to be up..." + info "Waiting for dependent pods to be up ..." 
wait_for_pods_ready "-l app=etcd" wait_for_pods_ready "-l app=minio" fi @@ -264,7 +264,7 @@ if [[ $fvt == "true" ]]; then info "Deploying fvt resources for etcd and minio" kubectl apply -f fvt.yaml - info "Waiting for dependent pods to be up..." + info "Waiting for dependent pods to be up ..." wait_for_pods_ready "-l app=etcd" wait_for_pods_ready "-l app=minio" fi @@ -305,7 +305,7 @@ if [[ $namespace_scope_mode == "true" ]]; then rm crd/kustomization.yaml.bak fi -info "Waiting for ModelMesh Serving controller pod to be up..." +info "Waiting for ModelMesh Serving controller pod to be up ..." wait_for_pods_ready "-l control-plane=modelmesh-controller" # Older versions of kustomize have different load restrictor flag formats. @@ -344,4 +344,12 @@ if [[ $namespace_scope_mode != "true" ]] && [[ ! -z $user_ns_array ]]; then fi rm quickstart.yaml quickstart.yaml.bak fvt.yaml fvt.yaml.bak +# wait for FVT storage resources that take long to initialize +# we don't want to wait earlier to not hold up any setup steps +# that happen after the initial FVT install block +if [[ $fvt == "true" ]]; then + info "Waiting for FVT PVC storage to be initialized ..." + kubectl wait --for=condition=complete --timeout=180s job/pvc-init +fi + success "Successfully installed ModelMesh Serving!"