apache · tomsun28 · Sep 3, 2024 · Aug 30, 2024 · Aug 30, 2024 · Aug 30, 2024
diff --git a/home/docs/help/nvidia.md b/home/docs/help/nvidia.md
@@ -0,0 +1,37 @@
+---
+id: nvidia  
+title: NVIDIA Monitoring  
+sidebar_label: NVIDIA Monitoring  
+keywords: [Open Source Monitoring System, NVIDIA Monitoring]
+---
+
+> Collect and monitor general performance metrics of NVIDIA GPUs.
+> NVIDIA monitoring relies on the `nvidia-smi` command, which is installed together with the NVIDIA GPU driver, so the driver must be installed on the monitored host.
+
+### Configuration Parameters
+
+| Parameter Name   | Description                                                 |
+|------------------|-------------------------------------------------------------|
+| Monitoring Host  | The IP address (IPv4/IPv6) or domain name of the monitored endpoint. Note ⚠️ do not include protocol headers (e.g., https://, http://). |
+| Task Name        | The name identifying this monitoring task, which needs to be unique. |
+| Port             | The port exposed for Linux SSH, default is 22.               |
+| Username         | SSH connection username, required.                           |
+| Password         | SSH connection password, optional (private-key authentication is also supported). |
+| Collection Interval | Interval for periodically collecting monitoring data, in seconds. The minimum interval is 30 seconds. |
+| Probe Before Monitoring | Whether to probe the monitoring endpoint to check its availability before adding it. Monitoring is added or modified only if the probe succeeds. |
+| Description/Remarks | Additional notes and descriptions for this monitoring task. Users can add relevant information here. |
+
+### Collected Metrics
+
+#### Metric Set: basic
+
+| Metric Name            | Unit   | Description      |
+|------------------------|--------|------------------|
+| index                  | None   | GPU index        |
+| name                   | None   | GPU name         |
+| utilization.gpu[%]     | %      | GPU utilization  |
+| utilization.memory[%]  | %      | Memory utilization |
+| memory.total[MiB]      | MiB    | Total memory     |
+| memory.used[MiB]       | MiB    | Used memory      |
+| memory.free[MiB]       | MiB    | Free memory      |
+| temperature.gpu        | °C     | GPU temperature  |
diff --git a/home/i18n/zh-cn/docusaurus-plugin-content-docs/current/help/nvidia.md b/home/i18n/zh-cn/docusaurus-plugin-content-docs/current/help/nvidia.md
@@ -0,0 +1,37 @@
+---
+id: nvidia  
+title: 监控：NVIDIA监控      
+sidebar_label: NVIDIA监控      
+keywords: [开源监控系统, NVIDIA监控]
+---
+
+> 对 NVIDIA GPU 显卡的通用性能指标进行采集监控。
+> NVIDIA监控需要用到 nvidia-smi 命令，nvidia-smi 是与 NVIDIA GPU 驱动程序一起安装的。所以在监控NVIDIA时，我们需要安装 NVIDIA GPU 驱动程序。
+
+### 配置参数
+
+|  参数名称  |                        参数帮助描述                        |
+|--------|------------------------------------------------------|
+| 监控Host | 被监控的对端IPV4，IPV6或域名。注意⚠️不带协议头(eg: https://, http://)。 |
+| 任务名称   | 标识此监控的名称，名称需要保证唯一性。                                  |
+| 端口     | Linux SSH对外提供的端口，默认为22。                              |
+| 用户名    | SSH连接用户名，必填                                          |
+| 密码     | SSH连接密码，可选（也支持私钥认证）                                  |
+| 采集间隔   | 监控周期性采集数据间隔时间，单位秒，可设置的最小间隔为30秒                       |
+| 是否探测   | 新增监控前是否先探测检查监控可用性，探测成功才会继续新增修改操作                     |
+| 描述备注   | 更多标识和描述此监控的备注信息，用户可以在这里备注信息                          |
+
+### 采集指标
+
+#### 指标集合：basic
+
+| 指标名称               | 指标单位 | 指标帮助描述 |
+|--------------------|------|--------|
+| index              | 无    | 显卡索引   |
+| name     | 无    | 显卡名称 |
+| utilization.gpu[%]    | %    | GPU利用率 |
+| utilization.memory[%] | %    | 显存利用率 |
+| memory.total[MiB]       | MiB  | 总显存 |
+| memory.used[MiB]        | MiB  | 已用显存 |
+| memory.free[MiB]        | MiB  | 空闲显存 |
+| temperature.gpu    | °C   | 显卡温度 |
diff --git a/manager/src/main/resources/define/app-nvidia.yml b/manager/src/main/resources/define/app-nvidia.yml
@@ -0,0 +1,198 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Monitoring type category, e.g. service (application services), db (databases), custom, os (operating systems); this definition uses `server`
+category: server
+# Monitoring type identifier, e.g. linux, windows, tomcat, mysql, aws; here `nvidia`
+app: nvidia
+# Display name of the monitoring type, keyed by locale
+name:
+  zh-CN: NVIDIA
+  en-US: NVIDIA
+# Help text (HTML, keyed by locale) describing this monitoring type; helpLink below points to the matching documentation pages
+help:
+  zh-CN: Hertzbeat 使用 <a class='help_module_content' href='https://hertzbeat.apache.org/docs/advanced/extend-ssh'> SSH 协议</a> 对 NVIDIA GPU显卡的通用性能指标进行采集监控。<br>您可以点击“<i>新建 NVIDIA</i>”并配置HOST端口账户等相关参数进行添加，支持SSH账户密码或密钥认证。或者选择“<i>更多操作</i>”，导入已有配置。
+  en-US: Hertzbeat uses the <a class='help_module_content' href='https://hertzbeat.apache.org/docs/advanced/extend-ssh'>SSH protocol</a> to collect and monitor general performance metrics of NVIDIA GPUs. <br> You can click " <i>Create NVIDIA</i> " to add and configure parameters such as HOST, port, account, etc., supporting SSH account password or key authentication. Alternatively, you can select " <i>More Actions</i> " to import an existing configuration.
+  zh-TW: Hertzbeat 使用 <a class='help_module_content' href='https://hertzbeat.apache.org/docs/advanced/extend-ssh'>SSH 協議</a> 對 NVIDIA GPU 顯卡的通用性能指標進行採集監控。<br>您可以點擊“<i>新建 NVIDIA</i>”並配置 HOST、端口、帳戶等相關參數進行添加，支持 SSH 帳戶密碼或密鑰認證。或者選擇“<i>更多操作</i>”，導入已有配置。
+helpLink:
+  zh-CN: https://hertzbeat.apache.org/zh-cn/docs/help/nvidia/
+  en-US: https://hertzbeat.apache.org/docs/help/nvidia/
+# Input parameter definitions for this monitoring type (the web UI form is rendered from them)
+params:
+  # field: parameter key, referenced in the ssh config as ^_^host^_^
+  - field: host
+    # name: display name of the parameter, keyed by locale
+    name:
+      zh-CN: 目标Host
+      en-US: Target Host
+    # type: input field type (mostly maps to the HTML input type)
+    type: host
+    # required: whether a value must be provided (true or false)
+    required: true
+  # field: parameter key, referenced in the ssh config as ^_^port^_^
+  - field: port
+    # name: display name of the parameter, keyed by locale
+    name:
+      zh-CN: 端口
+      en-US: Port
+    # type: input field type (mostly maps to the HTML input type)
+    type: number
+    # range: allowed interval; required when type is number
+    range: '[0,65535]'
+    # required: whether a value must be provided (true or false)
+    required: true
+    # defaultValue: value used by default
+    defaultValue: 22
+  # field: parameter key, referenced in the ssh config as ^_^timeout^_^
+  - field: timeout
+    # name: display name of the parameter, keyed by locale
+    name:
+      zh-CN: 超时时间(ms)
+      en-US: Timeout(ms)
+    # type: input field type (mostly maps to the HTML input type)
+    type: number
+    # range: allowed interval; required when type is number
+    range: '[400,200000]'
+    # required: whether a value must be provided (true or false)
+    required: false
+    # defaultValue: value used by default
+    # 6000 ms, inside the [400,200000] range declared above
+    defaultValue: 6000
+  # field: parameter key, referenced in the ssh config as ^_^reuseConnection^_^
+  - field: reuseConnection
+    # name: display name of the parameter, keyed by locale
+    name:
+      zh-CN: 复用连接
+      en-US: Reuse Connection
+    # type: input field type (mostly maps to the HTML input type)
+    type: boolean
+    # required: whether a value must be provided (true or false); defaults to false below
+    required: true
+    defaultValue: false
+  # field: parameter key, referenced in the ssh config as ^_^username^_^
+  - field: username
+    # name: display name of the parameter, keyed by locale
+    name:
+      zh-CN: 用户名
+      en-US: Username
+    # type: input field type (mostly maps to the HTML input type)
+    type: text
+    # limit: maximum string length; applies when type is text
+    limit: 50
+    # required: whether a value must be provided (true or false)
+    required: true
+  # field: parameter key, referenced in the ssh config as ^_^password^_^
+  - field: password
+    # name: display name of the parameter, keyed by locale
+    name:
+      zh-CN: 密码
+      en-US: Password
+    # type: input field type (mostly maps to the HTML input type)
+    type: password
+    # required: whether a value must be provided (true or false)
+    required: false
+  # field: parameter key, referenced in the ssh config as ^_^privateKey^_^
+  - field: privateKey
+    # name: display name of the parameter, keyed by locale
+    name:
+      zh-CN: 私钥
+      en-US: PrivateKey
+    # type: input field type (mostly maps to the HTML input type)
+    type: textarea
+    placeholder: -----BEGIN RSA PRIVATE KEY-----
+    # required: whether a value must be provided (true or false)
+    required: false
+    # hide: whether the parameter is hidden (true or false)
+    hide: true
+# List of metric groups to collect
+metrics:
+  # Metric group `basic`: basic GPU information
+  - name: basic
+    i18n:
+      zh-CN: 显卡基本信息
+      en-US: Basic Information
+    # Scheduling priority, 0 (highest) to 127 (lowest); groups with the same priority are scheduled in parallel
+    # Priority 0 marks the availability group: it is scheduled first, and scheduling continues only if its collection succeeds
+    priority: 0
+    # Fields collected for this metric group
+    fields:
+      # Per field: field = metric name, type = metric type (0 number, 1 string), unit = metric unit ('%', 'ms', 'MB'), label = whether the field is a metrics label
+      - field: index
+        type: 1
+        label: true
+        i18n:
+          zh-CN: 显卡索引
+          en-US: GPU Index
+      - field: name
+        type: 1
+        i18n:
+          zh-CN: 显卡名称
+          en-US: GPU Name
+      - field: utilization.gpu [%]
+        type: 0
+        unit: '%'
+        i18n:
+          zh-CN: GPU利用率
+          en-US: GPU Utilization
+      - field: utilization.memory [%]
+        type: 0
+        unit: '%'
+        i18n:
+          zh-CN: 显存利用率
+          en-US: Memory Utilization
+      - field: memory.total [MiB]
+        type: 0  # number (0-number, 1-string), same as memory.used / memory.free
+        unit: 'MiB'
+        i18n:
+          zh-CN: 总显存
+          en-US: Total Memory
+      - field: memory.used [MiB]
+        type: 0
+        unit: 'MiB'
+        i18n:
+          zh-CN: 已用显存
+          en-US: Used Memory
+      - field: memory.free [MiB]
+        type: 0
+        unit: 'MiB'
+        i18n:
+          zh-CN: 空闲显存
+          en-US: Free Memory
+      - field: temperature.gpu
+        type: 0  # number (0-number, 1-string), like the other fields that carry a unit
+        unit: '°C'
+        i18n:
+          zh-CN: 显卡温度
+          en-US: GPU Temperature
+    # Protocol used to collect this metric group, e.g. sql, ssh, http, telnet, wmi, snmp, sdk
+    protocol: ssh
+    # Connection settings used when protocol is ssh; ^_^name^_^ placeholders refer to the params of the same name
+    ssh:
+      # ssh host: IPv4, IPv6 or domain name
+      host: ^_^host^_^
+      # ssh port
+      port: ^_^port^_^
+      # ssh username
+      username: ^_^username^_^
+      # ssh password
+      password: ^_^password^_^
+      # ssh private key; the timeout and reuseConnection entries after it also come from the params
+      privateKey: ^_^privateKey^_^
+      timeout: ^_^timeout^_^
+      reuseConnection: ^_^reuseConnection^_^
+      # Collect script run over ssh: nvidia-smi CSV query whose commas are turned into spaces. NOTE(review): the middle sed reads as space-to-space here; confirm whether it holds a non-ASCII separator
+      script: nvidia-smi --query-gpu=index,name,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,temperature.gpu --format=csv,nounits | sed 's/ *, */,/g' | sed 's/ / /g' | sed 's/,/ /g'
+      # ssh response data parse type: oneRow, multiRow
+      parseType: multiRow