Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up code for custom CRD #1355

Merged
merged 3 commits into from
Oct 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/v1beta1/bayesianoptimization-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ spec:
- adam
- ftrl
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/cmaes-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ spec:
- adam
- ftrl
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/custom-metricscollector-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ spec:
min: "0.3"
max: "0.7"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/file-metricscollector-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ spec:
min: "0.3"
max: "0.7"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/fpga/xgboost-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ spec:
name: subsample
parameterType: double
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: alpha
description: L1 regularization term on weights
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/grid-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ spec:
- adam
- ftrl
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/hyperband-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ spec:
min: "20"
max: "20"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/metric-strategy-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ spec:
- adam
- ftrl
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/nas/darts-example-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ spec:
list:
- "3"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: algorithmSettings
description: Algorithm settings of DARTS Experiment
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/nas/darts-example-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ spec:
- "3"
- operationType: skip_connection
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: algorithmSettings
description: Algorithm settings of DARTS Experiment
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/nas/enas-example-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ spec:
max: "3"
step: "1"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: neuralNetworkArchitecture
description: NN architecture contains operations ID on each NN layer and skip connections between layers
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/nas/enas-example-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ spec:
max: "3"
step: "1"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: neuralNetworkArchitecture
description: NN architecture contains operations ID on each NN layer and skip connections between layers
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/pytorchjob-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ spec:
min: "0.5"
max: "0.9"
trialTemplate:
primaryContainerName: pytorch
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/random-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ spec:
- adam
- ftrl
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/resume-experiment/from-volume-resume.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ spec:
- adam
- ftrl
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/resume-experiment/never-resume.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ spec:
- adam
- ftrl
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/tfjob-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ spec:
min: "100"
max: "200"
trialTemplate:
primaryContainerName: tensorflow
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/tpe-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ spec:
- adam
- ftrl
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
1 change: 1 addition & 0 deletions examples/v1beta1/trial-metadata-substitution.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ spec:
min: "0.01"
max: "0.03"
trialTemplate:
primaryContainerName: training-container
trialParameters:
- name: learningRate
description: Learning rate for the training model
Expand Down
3 changes: 3 additions & 0 deletions manifests/v1beta1/katib-controller/katib-controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ spec:
command: ["./katib-controller"]
args:
- "--webhook-port=8443"
- "--trial-resources=Job.v1.batch"
- "--trial-resources=TFJob.v1.kubeflow.org"
- "--trial-resources=PyTorchJob.v1.kubeflow.org"
- "--trial-resources=MPIJob.v1.kubeflow.org"
ports:
- containerPort: 8443
Expand Down
25 changes: 21 additions & 4 deletions pkg/apis/controller/experiments/v1beta1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,32 @@ limitations under the License.
package v1beta1

// Default values for Experiment spec fields.
const (
// DefaultTrialParallelCount is the default value of spec.parallelTrialCount.
DefaultTrialParallelCount = 3

// DefaultTrialConfigMapName is the default value of spec.trialTemplate.configMapName for Trial template.
DefaultTrialConfigMapName = "trial-template"

// DefaultTrialTemplatePath is the default value of spec.trialTemplate.TemplatePath.
DefaultTrialTemplatePath = "defaultTrialTemplate.yaml"

// DefaultResumePolicy is the default value of spec.resumePolicy.
// LongRunning is defined outside this block.
DefaultResumePolicy = LongRunning

// DefaultJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Job.
// NOTE(review): the expression looks like a GJSON path that selects entries of
// status.conditions with type "Complete" and status "True" - confirm against the
// code that evaluates it.
DefaultJobSuccessCondition = "status.conditions.#(type==\"Complete\")#|#(status==\"True\")#"

// DefaultJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Job.
// Same expression shape as the success condition, matching type "Failed".
DefaultJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"

// DefaultKubeflowJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Kubeflow Job.
// It differs from DefaultJobSuccessCondition only in the condition type: "Succeeded" instead of "Complete".
DefaultKubeflowJobSuccessCondition = "status.conditions.#(type==\"Succeeded\")#|#(status==\"True\")#"

// DefaultKubeflowJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Job.
// Currently the same expression as DefaultJobFailureCondition.
DefaultKubeflowJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
)

// DefaultKubeflowJobPrimaryPodLabels holds the labels used as the default value
// of spec.trialTemplate.primaryPodLabels for Kubeflow Job.
//
// It is declared as a variable because Go has no constant maps. Code that
// assigns it directly shares the same underlying map, so treat it as read-only.
var DefaultKubeflowJobPrimaryPodLabels = map[string]string{
	"job-role": "master",
}
24 changes: 24 additions & 0 deletions pkg/apis/controller/experiments/v1beta1/experiment_defaults.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pkg/apis/controller/experiments/v1beta1/experiment_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,8 @@ type TrialTemplate struct {
// List of parameters that are used in trial template
TrialParameters []TrialParameterSpec `json:"trialParameters,omitempty"`

// Labels that determines if pod needs to be injected by Katib sidecar container
// Labels that determine whether a pod needs to be injected with the Katib sidecar container.
// If PrimaryPodLabels is omitted, the metrics collector wraps all of the Trial's pods.
PrimaryPodLabels map[string]string `json:"primaryPodLabels,omitempty"`

// Name of training container where actual model training is running
Expand Down
5 changes: 0 additions & 5 deletions pkg/controller.v1beta1/consts/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,6 @@ const (
// JobKindPyTorch is the kind of PyTorchJob.
JobKindPyTorch = "PyTorchJob"

// built-in JobRoles
JobRole = "job-role"
JobRoleTF = "tf-job-role"
JobRolePyTorch = "pytorch-job-role"

// AnnotationIstioSidecarInjectName is the annotation of Istio Sidecar
AnnotationIstioSidecarInjectName = "sidecar.istio.io/inject"

Expand Down
5 changes: 0 additions & 5 deletions pkg/controller.v1beta1/experiment/experiment_consts.go

This file was deleted.

2 changes: 1 addition & 1 deletion pkg/controller.v1beta1/experiment/experiment_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ func (r *ReconcileExperiment) Reconcile(request reconcile.Request) (reconcile.Re
if err != nil {
logger.Error(err, "Reconcile experiment error")
r.recorder.Eventf(instance,
corev1.EventTypeWarning, ReconcileFailedReason,
corev1.EventTypeWarning, consts.ReconcileErrorReason,
"Failed to reconcile: %v", err)
return reconcile.Result{}, err
}
Expand Down
15 changes: 10 additions & 5 deletions pkg/controller.v1beta1/experiment/experiment_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@ import (
)

const (
experimentName = "test-experiment"
trialName = "test-trial"
namespace = "default"
experimentName = "test-experiment"
trialName = "test-trial"
namespace = "default"
primaryContainer = "tensorflow"

timeout = time.Second * 40
)
Expand Down Expand Up @@ -372,7 +373,7 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "tensorflow",
Name: primaryContainer,
Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
Command: []string{
"python",
Expand Down Expand Up @@ -415,6 +416,10 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
},
ResumePolicy: experimentsv1beta1.NeverResume,
TrialTemplate: &experimentsv1beta1.TrialTemplate{
PrimaryPodLabels: experimentsv1beta1.DefaultKubeflowJobPrimaryPodLabels,
PrimaryContainerName: primaryContainer,
SuccessCondition: experimentsv1beta1.DefaultKubeflowJobSuccessCondition,
FailureCondition: experimentsv1beta1.DefaultKubeflowJobFailureCondition,
TrialParameters: []experimentsv1beta1.TrialParameterSpec{
{
Name: "learningRate",
Expand Down Expand Up @@ -509,7 +514,7 @@ func newFakeTFJob() *tfv1.TFJob {
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "tensorflow",
Name: primaryContainer,
Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
Command: []string{
"python",
Expand Down
11 changes: 6 additions & 5 deletions pkg/controller.v1beta1/experiment/manifest/generator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,21 @@ package manifest

import (
"errors"
"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
"math"
"reflect"
"testing"

"github.com/golang/mock/gomock"
batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"

commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1"
experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1"
"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
"github.com/kubeflow/katib/pkg/controller.v1beta1/util"
katibclientmock "github.com/kubeflow/katib/pkg/mock/v1beta1/util/katibclient"
batchv1 "k8s.io/api/batch/v1"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)

func TestGetRunSpecWithHP(t *testing.T) {
Expand Down
1 change: 0 additions & 1 deletion pkg/controller.v1beta1/experiment/util/status_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ const (
ExperimentMaxTrialsReachedReason = "ExperimentMaxTrialsReached"
ExperimentSuggestionEndReachedReason = "ExperimentSuggestionEndReached"
ExperimentFailedReason = "ExperimentFailed"
ExperimentKilledReason = "ExperimentKilled"
)

func UpdateExperimentStatus(collector *ExperimentsCollector, instance *experimentsv1beta1.Experiment, trials *trialsv1beta1.TrialList) error {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@ const (
SuggestionDeploymentReady = "DeploymentReady"
SuggestionDeploymentNotReady = "DeploymentNotReady"
SuggestionRunningReason = "SuggestionRunning"
SuggestionSucceededReason = "SuggestionSucceeded"
SuggestionFailedReason = "SuggestionFailed"
SuggestionKilledReason = "SuggestionKilled"
)

func (r *ReconcileSuggestion) updateStatus(s *suggestionsv1beta1.Suggestion, oldS *suggestionsv1beta1.Suggestion) error {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"time"

"github.com/golang/mock/gomock"

"github.com/onsi/gomega"
"golang.org/x/net/context"
appsv1 "k8s.io/api/apps/v1"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,22 @@ import (
"time"

"github.com/golang/mock/gomock"
"github.com/onsi/gomega"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1"
commonv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1"
experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1"
suggestionsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/suggestions/v1beta1"
trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1"
suggestionapi "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
suggestionapimock "github.com/kubeflow/katib/pkg/mock/v1beta1/api"

"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
"github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
suggestionapimock "github.com/kubeflow/katib/pkg/mock/v1beta1/api"
)

type k8sMatcher struct {
Expand Down
Loading