Skip to content

Commit

Permalink
Merge branch 'master' into fix/leaderelection_signal_ctx
Browse files Browse the repository at this point in the history
  • Loading branch information
rfyiamcool authored Aug 8, 2023
2 parents e7ffd2b + d57f9f3 commit 2477fd3
Show file tree
Hide file tree
Showing 74 changed files with 784 additions and 321 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,4 @@ coverage.txt
vendor

# helm dependency files
installer/helm/chart/volcano/charts/
installer/helm/chart/volcano/requirements.lock
1 change: 1 addition & 0 deletions OWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ reviewers:
- hwdef
- Yikun
- jiangkaihua
- wangyang0616
approvers:
- k82cn
- kevin-wangzefeng
Expand Down
25 changes: 11 additions & 14 deletions cmd/controller-manager/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,20 @@ package options
import (
"fmt"
"os"
"time"

"github.com/spf13/pflag"

"volcano.sh/volcano/pkg/kube"
)

const (
defaultQPS = 50.0
defaultBurst = 100
defaultWorkers = 3
defaultMaxRequeueNum = 15
defaultSchedulerName = "volcano"
defaultHealthzAddress = ":11251"
defaultDetectionPeriodOfDependsOntask = 100 * time.Millisecond
defaultLockObjectNamespace = "volcano-system"
defaultQPS = 50.0
defaultBurst = 100
defaultWorkers = 3
defaultMaxRequeueNum = 15
defaultSchedulerName = "volcano"
defaultHealthzAddress = ":11251"
defaultLockObjectNamespace = "volcano-system"
)

// ServerOption is the main context object for the controllers.
Expand All @@ -60,11 +58,11 @@ type ServerOption struct {
// defaulting to 0.0.0.0:11251
HealthzBindAddress string
EnableHealthz bool
// For dependent tasks, there is a detection cycle inside volcano
// It indicates how often to detect the status of dependent tasks
DetectionPeriodOfDependsOntask time.Duration
// To determine whether inherit owner's annotations for pods when create podgroup
InheritOwnerAnnotations bool
// WorkerThreadsForPG is the number of threads syncing podgroup operations
// The larger the number, the faster the podgroup processing, but requires more CPU load.
WorkerThreadsForPG uint32
}

type DecryptFunc func(c *ServerOption) error
Expand Down Expand Up @@ -94,9 +92,8 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
fs.IntVar(&s.MaxRequeueNum, "max-requeue-num", defaultMaxRequeueNum, "The number of times a job, queue or command will be requeued before it is dropped out of the queue")
fs.StringVar(&s.HealthzBindAddress, "healthz-address", defaultHealthzAddress, "The address to listen on for the health check server.")
fs.BoolVar(&s.EnableHealthz, "enable-healthz", false, "Enable the health check; it is false by default")
fs.DurationVar(&s.DetectionPeriodOfDependsOntask, "detection-period-of-dependson-task", defaultDetectionPeriodOfDependsOntask, "It indicates how often to detect the status of dependent tasks."+
"e.g. --detection-period-of-dependson-task=1s")
fs.BoolVar(&s.InheritOwnerAnnotations, "inherit-owner-annotations", true, "Enable inherit owner annotations for pods when create podgroup; it is enabled by default")
fs.Uint32Var(&s.WorkerThreadsForPG, "worker-threads-for-podgroup", 1, "The number of threads syncing podgroup operations. The larger the number, the faster the podgroup processing, but requires more CPU load.")
}

// CheckOptionOrDie checks the LockObjectNamespace.
Expand Down
18 changes: 9 additions & 9 deletions cmd/controller-manager/app/options/options_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,15 @@ func TestAddFlags(t *testing.T) {
QPS: defaultQPS,
Burst: 200,
},
PrintVersion: false,
WorkerThreads: defaultWorkers,
SchedulerNames: []string{"volcano", "volcano2"},
MaxRequeueNum: defaultMaxRequeueNum,
HealthzBindAddress: ":11251",
DetectionPeriodOfDependsOntask: defaultDetectionPeriodOfDependsOntask,
InheritOwnerAnnotations: true,
EnableLeaderElection: true,
LockObjectNamespace: defaultLockObjectNamespace,
PrintVersion: false,
WorkerThreads: defaultWorkers,
SchedulerNames: []string{"volcano", "volcano2"},
MaxRequeueNum: defaultMaxRequeueNum,
HealthzBindAddress: ":11251",
InheritOwnerAnnotations: true,
EnableLeaderElection: true,
LockObjectNamespace: defaultLockObjectNamespace,
WorkerThreadsForPG: 1,
}

if !reflect.DeepEqual(expected, s) {
Expand Down
6 changes: 2 additions & 4 deletions cmd/controller-manager/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ import (
vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
"volcano.sh/volcano/cmd/controller-manager/app/options"
"volcano.sh/volcano/pkg/controllers/framework"
"volcano.sh/volcano/pkg/controllers/job"
"volcano.sh/volcano/pkg/kube"
"volcano.sh/volcano/pkg/signals"
)
Expand All @@ -62,8 +61,6 @@ func Run(opt *options.ServerOption) error {
}
}

job.SetDetectionPeriodOfDependsOntask(opt.DetectionPeriodOfDependsOntask)

run := startControllers(config, opt)

ctx := signals.SetupSignalContext()
Expand All @@ -90,7 +87,7 @@ func Run(opt *options.ServerOption) error {
// add a uniquifier so that two processes on the same host don't accidentally both become active
id := hostname + "_" + string(uuid.NewUUID())

rl, err := resourcelock.New(resourcelock.ConfigMapsLeasesResourceLock,
rl, err := resourcelock.New(resourcelock.LeasesResourceLock,
opt.LockObjectNamespace,
"vc-controller-manager",
leaderElectionClient.CoreV1(),
Expand Down Expand Up @@ -130,6 +127,7 @@ func startControllers(config *rest.Config, opt *options.ServerOption) func(ctx c
controllerOpt.VolcanoClient = vcclientset.NewForConfigOrDie(config)
controllerOpt.SharedInformerFactory = informers.NewSharedInformerFactory(controllerOpt.KubeClient, 0)
controllerOpt.InheritOwnerAnnotations = opt.InheritOwnerAnnotations
controllerOpt.WorkerThreadsForPG = opt.WorkerThreadsForPG

return func(ctx context.Context) {
framework.ForeachController(func(c framework.Controller) {
Expand Down
2 changes: 1 addition & 1 deletion cmd/scheduler/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ func Run(opt *options.ServerOption) error {
// add a uniquifier so that two processes on the same host don't accidentally both become active
id := hostname + "_" + string(uuid.NewUUID())

rl, err := resourcelock.New(resourcelock.ConfigMapsLeasesResourceLock,
rl, err := resourcelock.New(resourcelock.LeasesResourceLock,
opt.LockObjectNamespace,
commonutil.GenerateComponentName(opt.SchedulerNames),
leaderElectionClient.CoreV1(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ When controller need create a podgroup, it will check the pod `ownerReferences`

### DiscoveryClient & DynamicClient
From the `ownerReferences`, we can get or create a GVK and owner resource name.
Discovery client is foucs on the k8s resources, it can get GVR from GVK. When get GVR and owner resource name, we can use Dynamic client to get the owner resource's `ownerReferences`.
The Discovery client is focused on k8s resources; it can get the GVR from a GVK. Once we have the GVR and the owner resource name, we can use the Dynamic client to get the owner resource's `ownerReferences`.

### RBAC

Expand Down
33 changes: 24 additions & 9 deletions docs/design/custom-plugin.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ tar -xf musl-1.2.1.tar.gz && cd musl-1.2.1
make && sudo make install

# build plugin
CC=/usr/local/musl/bin/musl-gcc CGO_ENABLED=1 go build -buildmode=plugin magic.go
CC=/usr/local/musl/bin/musl-gcc CGO_ENABLED=1 go build -o plugins/magic.so -buildmode=plugin magic.go
```

#### B. Use gnu-libc build plugin
Expand All @@ -90,39 +90,54 @@ you can just build the plugin in local.

```bash
# default CC is gcc
CGO_ENABLED=1 go build -buildmode=plugin magic.go
CGO_ENABLED=1 go build -o plugins/magic.so -buildmode=plugin magic.go
```

### 4. Add plugins into container

You can build your own docker image

```dockerfile
FROM volcano.sh/vc-scheduler:${VERSION}
#Dockerfile
FROM volcanosh/vc-scheduler:latest

COPY plugins plugins
```

```
docker build -t volcanosh/vc-scheduler:magic-plugins .
```



Or just use `pvc` to mount these plugins

### 4. Specify deployment
```yaml
...
containers:
- name: volcano-scheduler
image: volcano.sh/vc-scheduler:${VERSION}
image: volcanosh/vc-scheduler:magic-plugins
args:
- --logtostderr
- --scheduler-conf=/volcano.scheduler/volcano-scheduler.conf
- -v=3
- --plugins-dir=plugins # specify plugins dir path
- 2>&1
- --logtostderr
- --scheduler-conf=/volcano.scheduler/volcano-scheduler.conf
- --enable-healthz=true
- --enable-metrics=true
- -v=3
- --plugins-dir=plugins # specify plugins dir path
- 2>&1
```
### 5. Update volcano-scheduler-configmap
Add your custom plugin name in configmap
```
kubectl edit cm volcano-scheduler-configmap -n volcano-system
```



```yaml
apiVersion: v1
kind: ConfigMap
Expand Down
21 changes: 20 additions & 1 deletion docs/design/device-sharing.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,26 @@ type Devices interface {
//HasDeviceRequest checks if the 'pod' request this device
HasDeviceRequest(pod *v1.Pod) bool
//FilterNode checks if the 'pod' fits in the current node
FilterNode(pod *v1.Pod) (bool, error)
// The first return value represents the filtering result, and the value range is "0, 1, 2, 3"
// 0: Success
// Success means that plugin ran correctly and found pod schedulable.
// 1: Error
// Error is used for internal plugin errors, unexpected input, etc.
// 2: Unschedulable
// Unschedulable is used when a plugin finds a pod unschedulable. The scheduler might attempt to
// preempt other pods to get this pod scheduled. Use UnschedulableAndUnresolvable to make the
// scheduler skip preemption.
// The accompanying status message should explain why the pod is unschedulable.
// 3: UnschedulableAndUnresolvable
// UnschedulableAndUnresolvable is used when a plugin finds a pod unschedulable and
// preemption would not change anything. Plugins should return Unschedulable if it is possible
// that the pod can get scheduled with preemption.
// The accompanying status message should explain why the pod is unschedulable.
FilterNode(pod *v1.Pod) (int, string, error)
//Allocate action in predicate
Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) error
//Release action in predicate
Expand Down
Binary file modified docs/design/images/jobflow-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file removed docs/design/images/jobflow-2.jpg
Binary file not shown.
Binary file added docs/design/images/jobflow-2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/design/images/jobflow-3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file removed docs/design/images/jobflow-4.png
Binary file not shown.
10 changes: 4 additions & 6 deletions docs/design/jobflow/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,13 @@ JobFlow helps migrating AI, BigData, HPC workloads to the cloud-native world. Th
- Some jobs need to depend on the completion of the previous job or other status when running, etc. Otherwise, the correct result cannot be calculated.
- Sometimes inter-job dependencies also require diverse dependency types, such as conditional dependencies, circular dependencies, probes, and so on.

![jobflow-2.jpg](../images/jobflow-2.jpg)
![jobflow-1.png](../images/jobflow-1.png)

## Design

![jobflow-1.png](../images/jobflow-1.png)
![jobflow-2.png](../images/jobflow-2.png)

![jobflow-3.png](../images/jobflow-3.png)

The blue part is the component of k8s itself, the green and brown are the components of volcano, and the yellow is the crd resource of volcano.
The blue parts are components of k8s itself, the orange parts are existing definitions of Volcano, and the red parts are the new definitions of JobFlow.

**jobflow job submission complete process**

Expand All @@ -58,7 +56,7 @@ Deleting a jobflow when the jobflow is in a non-complete state will be intercept

### Controller

![jobflow-4.png](../images/jobflow-4.png)
![jobflow-3.png](../images/jobflow-3.png)

### Webhook

Expand Down
2 changes: 2 additions & 0 deletions example/custom-plugin/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
FROM volcanosh/vc-scheduler:latest
COPY plugins plugins
6 changes: 5 additions & 1 deletion example/extender/extender.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"encoding/json"
"io/ioutil"
"net/http"

"volcano.sh/volcano/pkg/scheduler/api"
"volcano.sh/volcano/pkg/scheduler/plugins/extender"
)
Expand Down Expand Up @@ -71,7 +72,10 @@ func predicate(w http.ResponseWriter, r *http.Request) {

resp := &extender.PredicateResponse{}
if req.Task.BestEffort && len(req.Node.Tasks) > 10 {
resp.ErrorMessage = "Too many tasks on the node"
sts := api.Status{}
sts.Code = api.Unschedulable
sts.Reason = "Too many tasks on the node"
resp.Status = append(resp.Status, &sts)
}
response, err := json.Marshal(resp)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module volcano.sh/volcano
go 1.19

require (
github.com/agiledragon/gomonkey/v2 v2.1.0
github.com/agiledragon/gomonkey/v2 v2.2.0
github.com/elastic/go-elasticsearch/v7 v7.17.7
github.com/fsnotify/fsnotify v1.5.4
github.com/golang/mock v1.6.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tN
github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
github.com/agiledragon/gomonkey/v2 v2.1.0 h1:+5Dbq8a1fn89IgVk35O233R41FH0nBKFPn50wDZpNs0=
github.com/agiledragon/gomonkey/v2 v2.1.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY=
github.com/agiledragon/gomonkey/v2 v2.2.0 h1:QJWqpdEhGV/JJy70sZ/LDnhbSlMrqHAWHcNOjz1kyuI=
github.com/agiledragon/gomonkey/v2 v2.2.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
Expand Down
5 changes: 2 additions & 3 deletions hack/generate-yaml.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ fi
# Step2. update helm templates from config dir
HELM_TEMPLATES_DIR=${VK_ROOT}/installer/helm/chart/volcano/templates
HELM_VOLCANO_CRD_DIR=${VK_ROOT}/installer/helm/chart/volcano/crd
HELM_JOBFLOW_CRD_DIR=${VK_ROOT}/installer/helm/chart/jobflow/crd
HELM_JOBFLOW_CRD_DIR=${VK_ROOT}/installer/helm/chart/volcano/charts/jobflow/crd
VOLCANO_CRD_DIR=${VK_ROOT}/config/crd/volcano
JOBFLOW_CRD_DIR=${VK_ROOT}/config/crd/jobflow
echo Updating templates in $HELM_TEMPLATES_DIR
Expand Down Expand Up @@ -124,7 +124,6 @@ fi
cat ${VK_ROOT}/installer/namespace.yaml > ${DEPLOYMENT_FILE}

# Volcano
${HELM_BIN_DIR}/helm dependency update ${VK_ROOT}/installer/helm/chart/volcano
${HELM_BIN_DIR}/helm template ${VK_ROOT}/installer/helm/chart/volcano --namespace volcano-system \
--name-template volcano --set basic.image_tag_version=${VOLCANO_IMAGE_TAG} --set basic.crd_version=${CRD_VERSION}\
-s templates/admission.yaml \
Expand All @@ -139,7 +138,7 @@ ${HELM_BIN_DIR}/helm template ${VK_ROOT}/installer/helm/chart/volcano --namespac
>> ${DEPLOYMENT_FILE}

# JobFlow
${HELM_BIN_DIR}/helm template ${VK_ROOT}/installer/helm/chart/jobflow --namespace volcano-system \
${HELM_BIN_DIR}/helm template ${VK_ROOT}/installer/helm/chart/volcano/charts/jobflow --namespace volcano-system \
--name-template volcano --set basic.image_tag_version=${VOLCANO_IMAGE_TAG} --set basic.crd_version=${CRD_VERSION}\
-s templates/flow_v1alpha1_jobflows.yaml \
-s templates/flow_v1alpha1_jobtemplates.yaml \
Expand Down
7 changes: 4 additions & 3 deletions hack/local-up-volcano.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ YAML_FILENAME=volcano-${TAG}.yaml
function prepare {
echo "Preparing..."
install-helm
echo "Generating volcano deploy yaml"
make generate-yaml

echo "Building docker images"
make images
Expand All @@ -41,7 +39,10 @@ function prepare {

function install-volcano {
# TODO: add a graceful way waiting for all crd ready
kubectl apply -f ${RELEASE_FOLDER}/${YAML_FILENAME}
kubectl create namespace volcano-system
helm install volcano ${VK_ROOT}/installer/helm/chart/volcano --namespace volcano-system \
--set basic.image_tag_version=${TAG} \
--set basic.image_pull_policy=IfNotPresent
}

function uninstall-volcano {
Expand Down
1 change: 0 additions & 1 deletion hack/run-e2e-kind.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ function install-volcano {
kubectl apply -f installer/namespace.yaml

echo "Install volcano chart with crd version $crd_version"
helm dependency update installer/helm/chart/volcano
helm install ${CLUSTER_NAME} installer/helm/chart/volcano --namespace volcano-system --kubeconfig ${KUBECONFIG} \
--set basic.image_pull_policy=IfNotPresent \
--set basic.image_tag_version=${TAG} \
Expand Down
24 changes: 0 additions & 24 deletions installer/helm/chart/jobflow/Chart.yaml

This file was deleted.

Loading

0 comments on commit 2477fd3

Please sign in to comment.