feature: add support for mpijob in katib #1183
An RBAC rules manifest adds `mpijobs` to the `kubeflow.org` resources the Katib controller may manage:

```diff
@@ -64,6 +64,7 @@ rules:
   - kubeflow.org
   resources:
   - tfjobs
+  - mpijobs
   - pytorchjobs
   verbs:
   - "*"
```
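For context, the complete rule after this change would read roughly as follows (a sketch; the `apiGroups` key above the hunk is assumed from the surrounding manifest):

```yaml
rules:
- apiGroups:
  - kubeflow.org
  resources:
  - tfjobs
  - mpijobs
  - pytorchjobs
  verbs:
  - "*"
```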
In the metrics collector constants, the default metrics filter is widened to accept `:` as well as `=` between the metric name and its value:

```diff
@@ -30,7 +30,7 @@ const (
 	TrainingCompleted = "completed"

-	DefaultFilter = `([\w|-]+)\s*=\s*((-?\d+)(\.\d+)?)`
+	DefaultFilter = `([\w|-]+)\s*[:=]\s*((-?\d+)(\.\d+)?)`
 )
```
Review discussion on the `DefaultFilter` change:

- "Why do you want to change this?"
- "In most cases the printed logs look like `filtername : *%` or `filtername = *%`, so I would prefer the default filter to support both `:` and `=`. Would that be better? @andreyvelich WDYT?"
- "I'm happy with this modification."
- "If we change this, I think we should update the documentation about the default filter for the Metrics Collector, for example here: https://www.kubeflow.org/docs/components/hyperparameter-tuning/experiment/#metrics-collector. Do you think specifying a filter in the `:` format will be inconvenient for Katib users?"
- "I think we can keep it to enhance the default filter, but we need to open an issue to update the doc."
`TestGetKatibJob` gains a case where the pod is owned by a `batch/v1` launcher Job named `<mpijob-name>-launcher`:

```diff
@@ -480,6 +480,23 @@ func TestGetKatibJob(t *testing.T) {
 			Err:  true,
 			Name: "Invalid Kind",
 		},
+		{
+			Pod: v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					OwnerReferences: []metav1.OwnerReference{
+						{
+							APIVersion: "batch/v1",
+							Kind:       "Job",
+							Name:       "OwnerName-launcher",
+						},
+					},
+				},
+			},
+			ExpectedJobKind: "Job",
+			ExpectedJobName: "OwnerName",
+			Err:             false,
+			Name:            "Valid Pod",
+		},
 	}

 	for _, tc := range testCases {
```

Review discussion on the `Kind: "Job"` owner reference:

- "Not sure if it's related here, but just FYI: we removed Job and StatefulSet for the v1 candidate, so now MPI Operator only creates pods: kubeflow/mpi-operator#203"
- "(⊙o⊙)… we developed some features based on the worker's StatefulSet property..."
- "@terrytangyuan Is there any plan for when a new mpi-operator version will be released?"
- "No concrete timeline yet, but it should be relatively stable now, as we are getting close to graduation."
`TestIsMasterRole` is extended with an MPIJob case, where the launcher pod is identified by a dedicated role label rather than the `job-role` label used by TFJob and PyTorchJob:

```diff
@@ -498,6 +515,7 @@ func TestGetKatibJob(t *testing.T) {
 func TestIsMasterRole(t *testing.T) {
 	masterRoleLabel := make(map[string]string)
 	masterRoleLabel[consts.JobRole] = MasterRole
+	masterRoleLabel[consts.JobRoleMpi] = LanucherRole
 	invalidLabel := make(map[string]string)
 	invalidLabel["invalid-label"] = "invalid"
 	testCases := []struct {
```

```diff
@@ -521,6 +539,16 @@ func TestIsMasterRole(t *testing.T) {
 			IsMaster: true,
 			Name:     "Pytorch Master Pod",
 		},
+		{
+			Pod: v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: masterRoleLabel,
+				},
+			},
+			JobKind:  "MPIJob",
+			IsMaster: true,
+			Name:     "MPI Launcher Pod",
+		},
 		{
 			Pod: v1.Pod{
 				ObjectMeta: metav1.ObjectMeta{
```
Review discussion on the MPI role label:

- "Job roles in MPI are named with `_`, not with `-`?"
- "Yes, refer to https://github.com/kubeflow/mpi-operator/blob/master/pkg/controllers/v1alpha2/mpi_job_controller.go#L81"
- "@terrytangyuan Hi, will the label change in the future, maybe in v1beta1 or v1?"
- "Agree with @gaocegege. I think MPI should follow the same patterns as TFJob and PyTorchJob."
- "Thanks. I changed it in kubeflow/mpi-operator#252 for the v1 candidate of MPI Operator. Perhaps this PR can add support for the v1 candidate directly? The API should be relatively stable even though there isn't an official release yet."
- "Agree with @gaocegege. @YuxiJin-tobeyjin, can you change this PR to support the v1 MPI Operator version?"
- "OK, but I need to run some tests against the latest mpi master branch to make sure the changes work as expected; I will do it ASAP."
- "Thanks for your contribution! 🎉 👍"
- "@terrytangyuan I've tried the mpi-operator master branch on our cluster with Kubernetes 1.14 and hit some problems. First, creating the new MPIJob CRD throws an error. After working around that, the CRD was created fine, but the mpi-operator itself failed to come up. Maybe the problem is here: it still uses KubeflowV1alpha2 to get the v1 CRD? https://github.com/kubeflow/mpi-operator/blob/acddf3028ce922e24ac3e735a2928ff4487be28f/cmd/mpi-operator.v1/app/server.go#L288 After working around that as well, the mpi-operator throws another error. So, to use mpi-operator v1, must we use Volcano as the batch scheduler? The latest kube-batch release does not support v1beta1.PodGroup, and we've actually been using the latest kube-batch release for some time..."
- "Thanks! I fixed the CRD validation issue in kubeflow/mpi-operator#257 (comment). Yes, Volcano has better support and maintenance. You may also want to update your MPI Operator deployment YAML, which includes permissions for Volcano resources: https://github.com/kubeflow/mpi-operator/blob/master/deploy/v1/mpi-operator.yaml"