kubeflow · k8s-ci-robot · Oct 26, 2020 · Oct 15, 2020 · Oct 15, 2020 · Oct 15, 2020
diff --git a/examples/v1beta1/bayesianoptimization-example.yaml b/examples/v1beta1/bayesianoptimization-example.yaml
@@ -39,6 +39,7 @@ spec:
           - adam
           - ftrl
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/cmaes-example.yaml b/examples/v1beta1/cmaes-example.yaml
@@ -36,6 +36,7 @@ spec:
           - adam
           - ftrl
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/custom-metricscollector-example.yaml b/examples/v1beta1/custom-metricscollector-example.yaml
@@ -52,6 +52,7 @@ spec:
         min: "0.3"
         max: "0.7"
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/file-metricscollector-example.yaml b/examples/v1beta1/file-metricscollector-example.yaml
@@ -39,6 +39,7 @@ spec:
         min: "0.3"
         max: "0.7"
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/fpga/xgboost-example.yaml b/examples/v1beta1/fpga/xgboost-example.yaml
@@ -35,6 +35,7 @@ spec:
       name: subsample
       parameterType: double
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: alpha
         description: L1 regularization term on weights

diff --git a/examples/v1beta1/grid-example.yaml b/examples/v1beta1/grid-example.yaml
@@ -37,6 +37,7 @@ spec:
           - adam
           - ftrl
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/hyperband-example.yaml b/examples/v1beta1/hyperband-example.yaml
@@ -46,6 +46,7 @@ spec:
         min: "20"
         max: "20"
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/metric-strategy-example.yaml b/examples/v1beta1/metric-strategy-example.yaml
@@ -41,6 +41,7 @@ spec:
           - adam
           - ftrl
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/nas/darts-example-cpu.yaml b/examples/v1beta1/nas/darts-example-cpu.yaml
@@ -40,6 +40,7 @@ spec:
               list:
                 - "3"
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: algorithmSettings
         description: Algorithm settings of DARTS Experiment

diff --git a/examples/v1beta1/nas/darts-example-gpu.yaml b/examples/v1beta1/nas/darts-example-gpu.yaml
@@ -57,6 +57,7 @@ spec:
                 - "3"
       - operationType: skip_connection
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: algorithmSettings
         description: Algorithm settings of DARTS Experiment

diff --git a/examples/v1beta1/nas/enas-example-cpu.yaml b/examples/v1beta1/nas/enas-example-cpu.yaml
@@ -123,6 +123,7 @@ spec:
               max: "3"
               step: "1"
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: neuralNetworkArchitecture
         description: NN architecture contains operations ID on each NN layer and skip connections between layers

diff --git a/examples/v1beta1/nas/enas-example-gpu.yaml b/examples/v1beta1/nas/enas-example-gpu.yaml
@@ -120,6 +120,7 @@ spec:
               max: "3"
               step: "1"
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: neuralNetworkArchitecture
         description: NN architecture contains operations ID on each NN layer and skip connections between layers

diff --git a/examples/v1beta1/pytorchjob-example.yaml b/examples/v1beta1/pytorchjob-example.yaml
@@ -25,6 +25,7 @@ spec:
         min: "0.5"
         max: "0.9"
   trialTemplate:
+    primaryContainerName: pytorch
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/random-example.yaml b/examples/v1beta1/random-example.yaml
@@ -36,6 +36,7 @@ spec:
           - adam
           - ftrl
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/resume-experiment/from-volume-resume.yaml b/examples/v1beta1/resume-experiment/from-volume-resume.yaml
@@ -37,6 +37,7 @@ spec:
           - adam
           - ftrl
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/resume-experiment/never-resume.yaml b/examples/v1beta1/resume-experiment/never-resume.yaml
@@ -37,6 +37,7 @@ spec:
           - adam
           - ftrl
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/tfjob-example.yaml b/examples/v1beta1/tfjob-example.yaml
@@ -32,6 +32,7 @@ spec:
         min: "100"
         max: "200"
   trialTemplate:
+    primaryContainerName: tensorflow
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/tpe-example.yaml b/examples/v1beta1/tpe-example.yaml
@@ -36,6 +36,7 @@ spec:
           - adam
           - ftrl
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/examples/v1beta1/trial-metadata-substitution.yaml b/examples/v1beta1/trial-metadata-substitution.yaml
@@ -23,6 +23,7 @@ spec:
         min: "0.01"
         max: "0.03"
   trialTemplate:
+    primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model

diff --git a/manifests/v1beta1/katib-controller/katib-controller.yaml b/manifests/v1beta1/katib-controller/katib-controller.yaml
@@ -25,6 +25,9 @@ spec:
           command: ["./katib-controller"]
           args:
             - "--webhook-port=8443"
+            - "--trial-resources=Job.v1.batch"
+            - "--trial-resources=TFJob.v1.kubeflow.org"
+            - "--trial-resources=PyTorchJob.v1.kubeflow.org"
             - "--trial-resources=MPIJob.v1.kubeflow.org"
           ports:
             - containerPort: 8443

diff --git a/pkg/apis/controller/experiments/v1beta1/constants.go b/pkg/apis/controller/experiments/v1beta1/constants.go
@@ -16,15 +16,32 @@ limitations under the License.
 package v1beta1
 
 const (
-	// Default value of Spec.ParallelTrialCount
+	// DefaultTrialParallelCount is the default value of spec.parallelTrialCount.
 	DefaultTrialParallelCount = 3
 
-	// Default value of Spec.ConfigMapName for Trial template
+	// DefaultTrialConfigMapName is the default value of spec.trialTemplate.configMapName for Trial template.
 	DefaultTrialConfigMapName = "trial-template"
 
-	// Default value of Spec.TemplatePath
+	// DefaultTrialTemplatePath is the default value of spec.trialTemplate.templatePath.
 	DefaultTrialTemplatePath = "defaultTrialTemplate.yaml"
 
-	// Default value of Spec.DefaultResumePolicy
+	// DefaultResumePolicy is the default value of spec.resumePolicy.
 	DefaultResumePolicy = LongRunning
+
+	// DefaultJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Job.
+	DefaultJobSuccessCondition = "status.conditions.#(type==\"Complete\")#|#(status==\"True\")#"
+
+	// DefaultJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Job.
+	DefaultJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
+
+	// DefaultKubeflowJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Kubeflow Job.
+	DefaultKubeflowJobSuccessCondition = "status.conditions.#(type==\"Succeeded\")#|#(status==\"True\")#"
+
+	// DefaultKubeflowJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Job.
+	DefaultKubeflowJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
+)
+
+var (
+	// DefaultKubeflowJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Job.
+	DefaultKubeflowJobPrimaryPodLabels = map[string]string{"job-role": "master"}
 )
diff --git a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go
diff --git a/pkg/apis/controller/experiments/v1beta1/experiment_types.go b/pkg/apis/controller/experiments/v1beta1/experiment_types.go
@@ -208,7 +208,8 @@ type TrialTemplate struct {
 	// List of parameters that are used in trial template
 	TrialParameters []TrialParameterSpec `json:"trialParameters,omitempty"`
 
-	// Labels that determines if pod needs to be injected by Katib sidecar container
+	// Labels that determine whether a pod needs to be injected with the Katib sidecar container.
+	// If PrimaryPodLabels is omitted, the metrics collector wraps all of the Trial's pods.
 	PrimaryPodLabels map[string]string `json:"primaryPodLabels,omitempty"`
 
 	// Name of training container where actual model training is running

diff --git a/pkg/controller.v1beta1/consts/const.go b/pkg/controller.v1beta1/consts/const.go
@@ -110,11 +110,6 @@ const (
 	// JobKindPyTorch is the kind of PyTorchJob.
 	JobKindPyTorch = "PyTorchJob"
 
-	// built-in JobRoles
-	JobRole        = "job-role"
-	JobRoleTF      = "tf-job-role"
-	JobRolePyTorch = "pytorch-job-role"
-
 	// AnnotationIstioSidecarInjectName is the annotation of Istio Sidecar
 	AnnotationIstioSidecarInjectName = "sidecar.istio.io/inject"
 

diff --git a/pkg/controller.v1beta1/experiment/experiment_consts.go b/pkg/controller.v1beta1/experiment/experiment_consts.go
diff --git a/pkg/controller.v1beta1/experiment/experiment_controller.go b/pkg/controller.v1beta1/experiment/experiment_controller.go
@@ -240,7 +240,7 @@ func (r *ReconcileExperiment) Reconcile(request reconcile.Request) (reconcile.Re
 		if err != nil {
 			logger.Error(err, "Reconcile experiment error")
 			r.recorder.Eventf(instance,
-				corev1.EventTypeWarning, ReconcileFailedReason,
+				corev1.EventTypeWarning, consts.ReconcileErrorReason,
 				"Failed to reconcile: %v", err)
 			return reconcile.Result{}, err
 		}

diff --git a/...r.v1beta1/experiment/experiment_status.go → ...xperiment/experiment_controller_status.go b/...r.v1beta1/experiment/experiment_status.go → ...xperiment/experiment_controller_status.go
diff --git a/pkg/controller.v1beta1/experiment/experiment_controller_test.go b/pkg/controller.v1beta1/experiment/experiment_controller_test.go
@@ -33,9 +33,10 @@ import (
 )
 
 const (
-	experimentName = "test-experiment"
-	trialName      = "test-trial"
-	namespace      = "default"
+	experimentName   = "test-experiment"
+	trialName        = "test-trial"
+	namespace        = "default"
+	primaryContainer = "tensorflow"
 
 	timeout = time.Second * 40
 )
@@ -372,7 +373,7 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
 						Spec: v1.PodSpec{
 							Containers: []v1.Container{
 								{
-									Name:  "tensorflow",
+									Name:  primaryContainer,
 									Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
 									Command: []string{
 										"python",
@@ -415,6 +416,10 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
 			},
 			ResumePolicy: experimentsv1beta1.NeverResume,
 			TrialTemplate: &experimentsv1beta1.TrialTemplate{
+				PrimaryPodLabels:     experimentsv1beta1.DefaultKubeflowJobPrimaryPodLabels,
+				PrimaryContainerName: primaryContainer,
+				SuccessCondition:     experimentsv1beta1.DefaultKubeflowJobSuccessCondition,
+				FailureCondition:     experimentsv1beta1.DefaultKubeflowJobFailureCondition,
 				TrialParameters: []experimentsv1beta1.TrialParameterSpec{
 					{
 						Name:        "learningRate",
@@ -509,7 +514,7 @@ func newFakeTFJob() *tfv1.TFJob {
 						Spec: v1.PodSpec{
 							Containers: []v1.Container{
 								{
-									Name:  "tensorflow",
+									Name:  primaryContainer,
 									Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
 									Command: []string{
 										"python",

diff --git a/...ler.v1beta1/experiment/experiment_util.go → .../experiment/experiment_controller_util.go b/...ler.v1beta1/experiment/experiment_util.go → .../experiment/experiment_controller_util.go
diff --git a/pkg/controller.v1beta1/experiment/manifest/generator_test.go b/pkg/controller.v1beta1/experiment/manifest/generator_test.go
@@ -2,20 +2,21 @@ package manifest
 
 import (
 	"errors"
-	"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
 	"math"
 	"reflect"
 	"testing"
 
 	"github.com/golang/mock/gomock"
+	batchv1 "k8s.io/api/batch/v1"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+
 	commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1"
 	experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1"
+	"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
 	"github.com/kubeflow/katib/pkg/controller.v1beta1/util"
 	katibclientmock "github.com/kubeflow/katib/pkg/mock/v1beta1/util/katibclient"
-	batchv1 "k8s.io/api/batch/v1"
-	"k8s.io/api/core/v1"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 )
 
 func TestGetRunSpecWithHP(t *testing.T) {

diff --git a/pkg/controller.v1beta1/experiment/util/status_util.go b/pkg/controller.v1beta1/experiment/util/status_util.go
@@ -38,7 +38,6 @@ const (
 	ExperimentMaxTrialsReachedReason     = "ExperimentMaxTrialsReached"
 	ExperimentSuggestionEndReachedReason = "ExperimentSuggestionEndReached"
 	ExperimentFailedReason               = "ExperimentFailed"
-	ExperimentKilledReason               = "ExperimentKilled"
 )
 
 func UpdateExperimentStatus(collector *ExperimentsCollector, instance *experimentsv1beta1.Experiment, trials *trialsv1beta1.TrialList) error {

diff --git a/pkg/controller.v1beta1/suggestion/suggestion_controller_status.go b/pkg/controller.v1beta1/suggestion/suggestion_controller_status.go
@@ -12,9 +12,7 @@ const (
 	SuggestionDeploymentReady    = "DeploymentReady"
 	SuggestionDeploymentNotReady = "DeploymentNotReady"
 	SuggestionRunningReason      = "SuggestionRunning"
-	SuggestionSucceededReason    = "SuggestionSucceeded"
 	SuggestionFailedReason       = "SuggestionFailed"
-	SuggestionKilledReason       = "SuggestionKilled"
 )
 
 func (r *ReconcileSuggestion) updateStatus(s *suggestionsv1beta1.Suggestion, oldS *suggestionsv1beta1.Suggestion) error {

diff --git a/pkg/controller.v1beta1/suggestion/suggestion_controller_test.go b/pkg/controller.v1beta1/suggestion/suggestion_controller_test.go
@@ -22,7 +22,6 @@ import (
 	"time"
 
 	"github.com/golang/mock/gomock"
-
 	"github.com/onsi/gomega"
 	"golang.org/x/net/context"
 	appsv1 "k8s.io/api/apps/v1"

diff --git a/pkg/controller.v1beta1/suggestion/suggestionclient/suggestionclient_test.go b/pkg/controller.v1beta1/suggestion/suggestionclient/suggestionclient_test.go
@@ -8,24 +8,22 @@ import (
 	"time"
 
 	"github.com/golang/mock/gomock"
+	"github.com/onsi/gomega"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/equality"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1"
 	commonv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1"
 	experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1"
 	suggestionsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/suggestions/v1beta1"
 	trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1"
 	suggestionapi "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
-	suggestionapimock "github.com/kubeflow/katib/pkg/mock/v1beta1/api"
-
-	"google.golang.org/grpc"
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
-
 	"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
-	"github.com/onsi/gomega"
-	corev1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/equality"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	suggestionapimock "github.com/kubeflow/katib/pkg/mock/v1beta1/api"
 )
 
 type k8sMatcher struct {