Skip to content

Commit

Permalink
update docs
Browse files Browse the repository at this point in the history
Signed-off-by: limengxuan <limengxuan@4paradigm.com>
  • Loading branch information
archlitchi committed Feb 1, 2024
1 parent 2ad91f0 commit 772d153
Show file tree
Hide file tree
Showing 5 changed files with 10 additions and 21 deletions.
2 changes: 1 addition & 1 deletion charts/vgpu/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ podSecurityPolicy:
enabled: false

global:
- gpuHookPath: /usr/local
+ gpuHookPath: /usr/local/vgpu
labels: {}
annotations: {}

Expand Down
10 changes: 2 additions & 8 deletions cmd/vGPUmonitor/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,17 +165,11 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
klog.Error(nvml.ErrorString(nvret))
}
memoryUsed := 0
- memory, ret := hdev.GetMemoryInfo_v2()
+ memory, ret := hdev.GetMemoryInfo()
if ret == nvml.SUCCESS {
memoryUsed = int(memory.Used)
} else {
- klog.Error("nvml get memory_v2 error ret=", ret)
- memory_v1, ret := hdev.GetMemoryInfo()
- if ret != nvml.SUCCESS {
- klog.Error("nvml get memory error ret=", ret)
- } else {
- memoryUsed = int(memory_v1.Used)
- }
+ klog.Error("nvml get memory error ret=", ret)
}

uuid, nvret := hdev.GetUUID()
Expand Down
2 changes: 1 addition & 1 deletion hack/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ export COMMIT_CODE
export VERSION="${SHORT_VERSION}-${COMMIT_CODE}"
export LATEST_VERSION="latest"
export GOLANG_IMAGE="golang:1.21-bullseye"
- export NVIDIA_IMAGE="nvidia/cuda:11.2.2-base-ubuntu20.04"
+ export NVIDIA_IMAGE="nvidia/cuda:12.2.0-devel-ubuntu20.04"
export DEST_DIR=""

IMAGE=${IMAGE-"4pdosc/k8s-vdevice"}
Expand Down
11 changes: 3 additions & 8 deletions pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,17 +107,12 @@ func (r *NvidiaDevicePlugin) getApiDevices() *[]*api.DeviceInfo {
panic(0)
}
memoryTotal := 0
- memory, ret := ndev.GetMemoryInfo_v2()
+ memory, ret := ndev.GetMemoryInfo()
if ret == nvml.SUCCESS {
memoryTotal = int(memory.Total)
} else {
- klog.Error("nvml get memory_v2 error ret=", ret)
- memory_v1, ret := ndev.GetMemoryInfo()
- if ret != nvml.SUCCESS {
- klog.Error("nvml get memory_v2 error ret=", ret)
- panic(0)
- }
- memoryTotal = int(memory_v1.Total)
+ klog.Error("nvml get memory error ret=", ret)
+ panic(0)
}
UUID, ret := ndev.GetUUID()
if ret != nvml.SUCCESS {
Expand Down
6 changes: 3 additions & 3 deletions pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.
os.Chmod("/tmp/vgpulock", 0777)
response.Mounts = append(response.Mounts,
&pluginapi.Mount{ContainerPath: fmt.Sprintf("%s/vgpu/libvgpu.so", hostHookPath),
- HostPath: hostHookPath + "/libvgpu.so",
+ HostPath: hostHookPath + "/vgpu/libvgpu.so",
ReadOnly: true},
&pluginapi.Mount{ContainerPath: fmt.Sprintf("%s/vgpu", hostHookPath),
HostPath: cacheFileHostDirectory,
Expand All @@ -385,7 +385,7 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.
}
if !found {
response.Mounts = append(response.Mounts, &pluginapi.Mount{ContainerPath: "/etc/ld.so.preload",
- HostPath: hostHookPath + "/ld.so.preload",
+ HostPath: hostHookPath + "/vgpu/ld.so.preload",
ReadOnly: true},
)
}
Expand All @@ -398,7 +398,7 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.
})
response.Mounts = append(response.Mounts, &pluginapi.Mount{
ContainerPath: "/usr/bin/vgpuvalidator",
- HostPath: fmt.Sprintf("%s/vgpuvalidator", hostHookPath),
+ HostPath: fmt.Sprintf("%s/vgpu/vgpuvalidator", hostHookPath),
ReadOnly: true,
})
}
Expand Down

0 comments on commit 772d153

Please sign in to comment.