diff --git a/collector/src/main/java/com/usthe/collector/dispatch/CommonDispatcher.java b/collector/src/main/java/com/usthe/collector/dispatch/CommonDispatcher.java index f636043ca46..73c8beab86c 100644 --- a/collector/src/main/java/com/usthe/collector/dispatch/CommonDispatcher.java +++ b/collector/src/main/java/com/usthe/collector/dispatch/CommonDispatcher.java @@ -41,6 +41,7 @@ import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; /** * Indicator group collection task and response data scheduler @@ -189,7 +190,7 @@ public void dispatchCollectData(Timeout timeout, Metrics metrics, CollectRep.Met metricsTimeoutMonitorMap.remove(job.getId() + "-" + metrics.getName() + "-sub-" + metrics.getSubTaskId()); boolean isLastTask = metrics.consumeSubTaskResponse(metricsData); if (isLastTask) { - metricsData = metrics.getSubTaskDataTmp(); + metricsData = metrics.getSubTaskDataRef().get(); } else { return; } @@ -236,20 +237,21 @@ public void dispatchCollectData(Timeout timeout, Metrics metrics, CollectRep.Met // use pre collect metrics data to replace next metrics config params List> configmapList = getConfigmapFromPreCollectData(metricsData); metricsSet.forEach(metricItem -> { - JsonElement jsonElement = GSON.toJsonTree(metricItem); - if (configmapList != null && !configmapList.isEmpty() && CollectUtil.containCryPlaceholder(jsonElement)) { + if (configmapList != null && !configmapList.isEmpty() && CollectUtil.containCryPlaceholder(GSON.toJsonTree(metricItem))) { AtomicInteger subTaskNum = new AtomicInteger(configmapList.size()); + AtomicReference metricsDataReference = new AtomicReference<>(); for (int index = 0; index < configmapList.size(); index ++) { Map configmap = configmapList.get(index); - jsonElement = GSON.toJsonTree(metricItem); - CollectUtil.replaceCryPlaceholder(jsonElement, configmap); - metricItem = GSON.fromJson(jsonElement, 
Metrics.class); - metricItem.setSubTaskNum(subTaskNum); - metricItem.setSubTaskId(index); - MetricsCollect metricsCollect = new MetricsCollect(metricItem, timeout, this, unitConvertList); + JsonElement metricJson = GSON.toJsonTree(metricItem); + CollectUtil.replaceCryPlaceholder(metricJson, configmap); + Metrics metric = GSON.fromJson(metricJson, Metrics.class); + metric.setSubTaskNum(subTaskNum); + metric.setSubTaskId(index); + metric.setSubTaskDataRef(metricsDataReference); + MetricsCollect metricsCollect = new MetricsCollect(metric, timeout, this, unitConvertList); jobRequestQueue.addJob(metricsCollect); - metricsTimeoutMonitorMap.put(job.getId() + "-" + metricItem.getName() + "-sub-" + index, - new MetricsTime(System.currentTimeMillis(), metricItem, timeout)); + metricsTimeoutMonitorMap.put(job.getId() + "-" + metric.getName() + "-sub-" + index, + new MetricsTime(System.currentTimeMillis(), metric, timeout)); } } else { MetricsCollect metricsCollect = new MetricsCollect(metricItem, timeout, this, unitConvertList); diff --git a/common/src/main/java/com/usthe/common/entity/job/Metrics.java b/common/src/main/java/com/usthe/common/entity/job/Metrics.java index 3a42de41f82..4a3c2395949 100644 --- a/common/src/main/java/com/usthe/common/entity/job/Metrics.java +++ b/common/src/main/java/com/usthe/common/entity/job/Metrics.java @@ -31,6 +31,7 @@ import java.util.List; import java.util.Objects; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; /** * Details of the collection of indicators collected by monitoring @@ -165,7 +166,7 @@ public class Metrics { * collector使用 - 临时存储分级任务指标响应数据 */ @JsonIgnore - private transient CollectRep.MetricsData subTaskDataTmp; + private transient AtomicReference subTaskDataRef; /** * collector use - Temporarily store subTask running num @@ -200,11 +201,11 @@ public boolean consumeSubTaskResponse(CollectRep.MetricsData metricsData) { } synchronized (subTaskNum) { int index = 
subTaskNum.decrementAndGet(); - if (subTaskDataTmp == null) { - subTaskDataTmp = metricsData; + if (subTaskDataRef.get() == null) { + subTaskDataRef.set(metricsData); } else { - if (metricsData.getValuesCount() > 1) { - CollectRep.MetricsData.Builder dataBuilder = CollectRep.MetricsData.newBuilder(subTaskDataTmp); + if (metricsData.getValuesCount() >= 1) { + CollectRep.MetricsData.Builder dataBuilder = CollectRep.MetricsData.newBuilder(subTaskDataRef.get()); for (CollectRep.ValueRow valueRow : metricsData.getValuesList()) { if (valueRow.getColumnsCount() == dataBuilder.getFieldsCount()) { dataBuilder.addValues(valueRow); @@ -212,7 +213,7 @@ public boolean consumeSubTaskResponse(CollectRep.MetricsData metricsData) { log.error("consume subTask data value not mapping filed"); } } - subTaskDataTmp = dataBuilder.build(); + subTaskDataRef.set(dataBuilder.build()); } } return index == 0; diff --git a/home/docs/help/docker.md b/home/docs/help/docker.md new file mode 100644 index 00000000000..28bc13c39db --- /dev/null +++ b/home/docs/help/docker.md @@ -0,0 +1,106 @@ +--- +id: docker +title: 监控:Docker 监控 +sidebar_label: Docker 容器监控 + +--- + +> 对Docker容器的通用性能指标进行采集监控。 + + +## 监控前操作 + +如果想要监控 `Docker` 中的容器信息,则需要按照以下步骤打开端口,让采集请求获取到对应的信息。 + +**1、编辑docker.service文件:** + +```shell +vi /usr/lib/systemd/system/docker.service +``` + +找到 **[Service]** 节点,修改 ExecStart 属性,增加 `-H tcp://0.0.0.0:2375` + +```shell +ExecStart=/usr/bin/dockerd -H fd:// --containerd=/run/containerd/containerd.sock -H tcp://0.0.0.0:2375 +``` + +这样相当于对外开放的是 **2375** 端口,当然也可以根据自己情况修改成其他的。 + +**2、重新加载Docker配置生效:** + +```shell +systemctl daemon-reload +systemctl restart docker +``` + +**注意:记得在服务器控制台打开 `2375` 端口号。** + +**3、如果上述方法不行则:** + +在服务器内部打开 `2375` 端口号。 + +```shell +firewall-cmd --zone=public --add-port=2375/tcp --permanent +firewall-cmd --reload +``` + + + + + +### 配置参数 + +| 参数名称 | 参数帮助描述 | +| ------------ | ------------------------------------------------------------ | +| 监控Host | 
被监控的对端IPV4,IPV6或域名。注意⚠️不带协议头(eg: https://, http://)。 | +| 监控名称 | 标识此监控的名称,名称需要保证唯一性。 | +| 端口 | Docker对外提供的端口,默认为2375。 | +| 查询超时时间 | 设置获取Docker服务器API接口时的超时时间,单位ms毫秒,默认3000毫秒。 | +| 容器名称 | 一般是监控所有运行中的容器信息。 | +| 用户名 | 连接用户名,可选 | +| 密码 | 连接密码,可选 | +| URL | 数据库连接URL,可选,若配置,则URL里面的数据库名称,用户名密码等参数会覆盖上面配置的参数 | +| 采集间隔 | 监控周期性采集数据间隔时间,单位秒,可设置的最小间隔为10秒 | +| 是否探测 | 新增监控前是否先探测检查监控可用性,探测成功才会继续新增修改操作 | +| 描述备注 | 更多标识和描述此监控的备注信息,用户可以在这里备注信息 | + +### 采集指标 + +#### 指标集合:system + +| 指标名称 | 指标单位 | 指标帮助描述 | +| ------------------ | -------- | -------------------------------------- | +| Name | 无 | 服务器名称 | +| version | 无 | docker版本号 | +| os | 无 | 服务器版本 例如:linux x86_64 | +| root_dir | 无 | docker文件夹目录 例如:/var/lib/docker | +| containers | 无 | 容器总数(在运行+未运行) | +| containers_running | 无 | 运行中的容器数目 | +| containers_paused | 无 | 暂停中的容器数目 | +| images | 无 | 容器镜像的总数目。 | +| ncpu | 无 | NCPU | +| mem_total | MB | 占用的内存总大小 | +| system_time | 无 | 系统时间 | + +#### 指标集合:containers + +| 指标名称 | 指标单位 | 指标帮助描述 | +| -------- | -------- | ---------------------- | +| id | 无 | Docker中容器的ID | +| name | 无 | Docker容器中的容器名称 | +| image | 无 | Docker容器使用的镜像 | +| command | 无 | Docker中的默认启动命令 | +| state | 无 | Docker中容器的运行状态 | +| status | 无 | Docker容器中的更新时间 | + +#### 指标集合:stats + +| 指标名称 | 指标单位 | 指标帮助描述 | +| ---------------- | -------- | ---------------------------- | +| name | 无 | Docker容器中的名字 | +| available_memory | MB | Docker容器可以利用的内存大小 | +| used_memory | MB | Docker容器已经使用的内存大小 | +| memory_usage | 无 | Docker容器的内存使用率 | +| cpu_delta | 无 | Docker容器已经使用的CPU数量 | +| number_cpus | 无 | Docker容器可以使用的CPU数量 | +| cpu_usage | 无 | Docker容器CPU使用率 | diff --git a/home/i18n/en/docusaurus-plugin-content-docs/current.json b/home/i18n/en/docusaurus-plugin-content-docs/current.json index ea53353fdd5..5dd469c7cf7 100644 --- a/home/i18n/en/docusaurus-plugin-content-docs/current.json +++ b/home/i18n/en/docusaurus-plugin-content-docs/current.json @@ -54,5 +54,9 @@ "sidebar.docs.category.Others": { "message": "Others", "description": "The label for 
category Others in sidebar docs" + }, + "sidebar.docs.category.云原生": { + "message": "CloudNative", + "description": "The label for category 云原生 in sidebar docs" } } diff --git a/home/i18n/en/docusaurus-plugin-content-docs/current/help/docker.md b/home/i18n/en/docusaurus-plugin-content-docs/current/help/docker.md new file mode 100644 index 00000000000..3d5f9b1fc12 --- /dev/null +++ b/home/i18n/en/docusaurus-plugin-content-docs/current/help/docker.md @@ -0,0 +1,106 @@ +--- +id: docker +title: Monitor:Docker Monitor +sidebar_label: Docker Monitor + +--- + +> Collect and monitor general performance Metrics of Docker containers. + + +## Pre-monitoring operations + +If you want to monitor the container information in `Docker`, you need to open the port according to the following steps, so that the collection request can obtain the corresponding information. + +**1. Edit the docker.service file:** + +````shell +vi /usr/lib/systemd/system/docker.service +```` + +Find the **[Service]** node, modify the ExecStart property, and add `-H tcp://0.0.0.0:2375` + +````shell +ExecStart=/usr/bin/dockerd -H fd:// --containerd=/run/containerd/containerd.sock -H tcp://0.0.0.0:2375 +```` + +This is equivalent to the **2375** port that is open to the outside world. Of course, it can be modified to other ports according to your own situation. + +**2. Reload the Docker configuration to take effect:** + +```shell +systemctl daemon-reload +systemctl restart docker +``` + +**Note: Remember to open the `2375` port number in the server console.** + +**3. If the above method does not work:** + +Open the `2375` port number inside the server. + +```shell +firewall-cmd --zone=public --add-port=2375/tcp --permanent +firewall-cmd --reload +``` + + + + + +### Configuration parameters + +| Parameter name | Parameter help description | +| ------------ | ------------------------------- | +| Monitor Host | Monitored peer IPV4, IPV6 or domain name. Note ⚠️ without protocol headers (eg: https://, http://). 
| +| Monitor Name | Identifies the name of this monitor. The name needs to be unique. | +| Port | The port exposed by the Docker daemon, the default is 2375. | +| Query Timeout | Set the timeout when getting the Docker server API interface, in ms, the default is 3000 ms. | +| Container Name | Generally monitors all running container information. | +| username | connection username, optional | +| password | connection password, optional | +| URL | Database connection URL, optional, if configured, the parameters such as database name, username and password in the URL will override the parameters configured above | +| Collection Interval | Monitor periodical collection data interval, in seconds, the minimum interval that can be set is 10 seconds | +| Whether to detect | Whether to detect and check the availability of monitoring before adding monitoring, and then continue to add and modify operations if the detection is successful | +| Description Remarks | More remarks that identify and describe this monitoring, users can remark information here | + +### Collect metrics + +#### Metric collection: system + +| Metric Name | Metric Unit | Metric Help Description | +| ------------------ | -------- | ----------------------- | +| Name | None | Server Name | +| version | none | docker version number | +| os | none | server version eg: linux x86_64 | +| root_dir | none | docker folder directory eg: /var/lib/docker | +| containers | None | Total number of containers (running + not running) | +| containers_running | None | Number of running containers | +| containers_paused | none | Number of paused containers | +| images | None | The total number of container images. 
| +| ncpu | none | ncpu | +| mem_total | MB | Total size of memory used | +| system_time | none | system time | + +#### Metric collection: containers + +| Metric Name | Metric Unit | Metric Help Description | +| -------- | -------- | ------------ | +| id | None | The ID of the container in Docker | +| name | None | The container name in the Docker container | +| image | None | Image used by the Docker container | +| command | None | Default startup command in Docker | +| state | None | The running state of the container in Docker | +| status | None | Update time in Docker container | + +#### Metrics collection: stats + +| Metric Name | Metric Unit | Metric Help Description | +| ---------------- | -------- | ------------------ | +| name | None | The name in the Docker container | +| available_memory | MB | The amount of memory that the Docker container can utilize | +| used_memory | MB | The amount of memory already used by the Docker container | +| memory_usage | None | Memory usage of the Docker container | +| cpu_delta | None | The number of CPUs already used by the Docker container | +| number_cpus | None | The number of CPUs that the Docker container can use | +| cpu_usage | None | Docker container CPU usage | diff --git a/home/sidebars.json b/home/sidebars.json index db2143377aa..70c368ac7d0 100644 --- a/home/sidebars.json +++ b/home/sidebars.json @@ -95,6 +95,13 @@ "help/tomcat" ] }, + { + "type": "category", + "label": "云原生", + "items": [ + "help/docker" + ] + }, { "type": "category", "label": "阈值告警配置", diff --git a/manager/src/main/resources/define/app/app-docker.yml b/manager/src/main/resources/define/app/app-docker.yml new file mode 100644 index 00000000000..772cfed9e52 --- /dev/null +++ b/manager/src/main/resources/define/app/app-docker.yml @@ -0,0 +1,217 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +category: cn +app: docker +name: + zh-CN: Docker + en-US: Docker +# 参数映射map. type是参数类型: 0-number数字, 1-string明文字符串, 2-secret加密字符串 +# 强制固定必须参数 - host +configmap: + - key: host + type: 1 + - key: port + type: 0 + - key: ssl + type: 1 +metrics: + - name: system + # 指标组调度优先级(0-127)越小优先级越高,优先级低的指标组会等优先级高的指标组采集完成后才会被调度,相同优先级的指标组会并行调度采集 + # 优先级为0的指标组为可用性指标组,即它会被首先调度,采集成功才会继续调度其它指标组,采集失败则中断调度 + priority: 0 + # 指标组中的具体监控指标 + fields: + # 指标信息 包括 field名称 type字段类型:0-number数字,1-string字符串 instance是否为实例主键 unit:指标单位 + - field: name + type: 1 + - field: version + type: 1 + - field: os + type: 1 + - field: root_dir + type: 1 + - field: containers + type: 0 + - field: containers_running + type: 0 + - field: containers_paused + type: 0 + - field: containers_stopped + type: 0 + - field: images + type: 0 + - field: ncpu + type: 0 + - field: mem_total + type: 0 + unit: MB + - field: system_time + type: 1 + aliasFields: + - Name + - ServerVersion + - OperatingSystem + - OSType + - Architecture + - DockerRootDir + - Containers + - ContainersRunning + - ContainersPaused + - ContainersStopped + - Images + - NCPU + - MemTotal + - SystemTime + calculates: + - name = Name + - version = OperatingSystem + " " + ServerVersion + - os = OSType + " " + Architecture + - root_dir = DockerRootDir + - containers = Containers + - 
containers_running = ContainersRunning + - containers_paused = ContainersPaused + - containers_stopped = ContainersStopped + - images = Images + - ncpu = NCPU + - mem_total = MemTotal + - system_time = SystemTime + units: + - mem_total=B->MB + protocol: http + # 当protocol为http协议时具体的采集配置 + http: + # 主机host: ipv4 ipv6 域名 + host: ^_^host^_^ + # 端口 + port: ^_^port^_^ + # url请求接口路径 + url: /info + # 请求方式 GET POST PUT DELETE PATCH + method: GET + # 是否启用ssl/tls,即是http还是https,默认false + ssl: ^_^ssl^_^ + # 响应数据解析方式: default-系统规则,jsonPath-jsonPath脚本,website-api可用性指标监控 + parseType: default + + - name: containers + # 指标组调度优先级(0-127)越小优先级越高,优先级低的指标组会等优先级高的指标组采集完成后才会被调度,相同优先级的指标组会并行调度采集 + # 优先级为0的指标组为可用性指标组,即它会被首先调度,采集成功才会继续调度其它指标组,采集失败则中断调度 + priority: 1 + # 指标组中的具体监控指标 + fields: + # 指标信息 包括 field名称 type字段类型:0-number数字,1-string字符串 instance是否为实例主键 unit:指标单位 + - field: id + type: 1 + - field: name + type: 1 + - field: image + type: 1 + - field: command + type: 1 + - field: state + type: 1 + - field: status + type: 1 + aliasFields: + - Id + - $.Names[0] + - Image + - Command + - State + - Status + calculates: + - id = Id + - name=#`$.Names[0]` + - image = Image + - command = Command + - state = State + - status = Status + # 监控采集使用协议 eg: sql, ssh, http, telnet, wmi, snmp, sdk + protocol: http + # 当protocol为http协议时具体的采集配置 + http: + # 主机host: ipv4 ipv6 域名 + host: ^_^host^_^ + # 端口 + port: ^_^port^_^ + # url请求接口路径 + url: /containers/json + # 请求方式 GET POST PUT DELETE PATCH + method: GET + # 是否启用ssl/tls,即是http还是https,默认false + ssl: ^_^ssl^_^ + # 响应数据解析方式: default-系统规则,jsonPath-jsonPath脚本,website-api可用性指标监控 + parseType: jsonPath + parseScript: '$.*' + + - name: stats + priority: 2 + fields: + - field: name + type: 1 + - field: available_memory + type: 0 + unit: MB + - field: used_memory + type: 0 + unit: MB + - field: memory_usage + type: 0 + unit: '%' + - field: cpu_delta + type: 0 + - field: number_cpus + type: 0 + - field: cpu_usage + type: 0 + unit: '%' + aliasFields: + - $.name + - 
$.memory_stats.usage + - $.memory_stats.limit + - $.cpu_stats.cpu_usage.total_usage + - $.precpu_stats.cpu_usage.total_usage + - $.cpu_stats.online_cpus + - $.cpu_stats.system_cpu_usage + - $.precpu_stats.system_cpu_usage + calculates: + - name=$.name + - available_memory = $.memory_stats.limit + - used_memory=$.memory_stats.usage + - memory_usage=($.memory_stats.usage / $.memory_stats.limit) * 100 + - cpu_delta=$.cpu_stats.cpu_usage.total_usage - $.precpu_stats.cpu_usage.total_usage + - number_cpus=$.cpu_stats.online_cpus + - cpu_usage=(($.cpu_stats.cpu_usage.total_usage - $.precpu_stats.cpu_usage.total_usage) / ($.cpu_stats.system_cpu_usage - $.precpu_stats.system_cpu_usage)) * $.cpu_stats.online_cpus * 100 + units: + - available_memory=B->MB + - used_memory=B->MB + protocol: http + http: + # 主机host: ipv4 ipv6 域名 + host: ^_^host^_^ + # 端口 + port: ^_^port^_^ + # url请求接口路径 + url: /containers/-_-id-_-/stats + # 请求方式 GET POST PUT DELETE PATCH + method: GET + # 是否启用ssl/tls,即是http还是https,默认false + ssl: ^_^ssl^_^ + params: + stream: false + # 响应数据解析方式: default-系统规则,jsonPath-jsonPath脚本,website-api可用性指标监控 + parseType: jsonPath + parseScript: '$' \ No newline at end of file diff --git a/manager/src/main/resources/define/param/param-docker.yml b/manager/src/main/resources/define/param/param-docker.yml new file mode 100644 index 00000000000..41affb7d1a6 --- /dev/null +++ b/manager/src/main/resources/define/param/param-docker.yml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# 监控应用类型名称(与文件名保持一致) +app: docker +# 强制固定必须参数 - host(ipv4,ipv6,域名) +param: + # field-字段名称标识符 + - field: host + # name-参数字段显示名称 + name: + zh-CN: 主机Host + en-US: Host + # type-字段类型,样式(大部分映射input标签type属性) + type: host + # 是否是必输项 true-必填 false-可选 + required: true + - field: port + name: + zh-CN: 端口 + en-US: Port + type: number + # 当type为number时,用range表示范围 + range: '[0,65535]' + required: true + defaultValue: 2375 + - field: ssl + name: + zh-CN: 启动SSL + en-US: SSL + # 当type为boolean时,前端用switch展示开关 + type: boolean + required: false