原生节点 POD GPU 监控
如何查看原生节点 POD 基础监控?
前置条件:
TKE 提供 elastic-gpu-exporter 组件用于获取 GPU 相关监控指标,需要先安装该组件采集GPU 指标数据,参考基础监控指标采集:容器服务 GPU 监控指标获取-TKE 标准集群指南-文档中心-腾讯云
Pod 中使用了gpu 资源基础监控面板才会有 gpu的 tab,否则将不会显示, 如下图:
超级节点 POD GPU 监控
如何查看超级节点 POD 基础监控?
超级节点pod 的 GPU 监控是默认显示的,可以直接看,如下图:
配置 GPU 指标的 HPA
使用场景
在业务使用 GPU 过程中,当业务处理量变大时,可能因 GPU 资源不足而导致业务出错,如下:
error:CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 46.81 MiB is free. Process 1075 has 14.70 GiB memory in use. Of the allocated memory 12.27 GiB is allocated by PyTorch, and 1.25 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
如何使用
可以根据 GPU 指标项通过 YAML 来配置 HPA,保障业务稳定运行。 譬如针对 GPU 资源的使用情况配置 HPA,相关配置示例如下:
apiVersion: autoscaling/v2beta2
kind: HorizontalPodAutoscaler
metadata:
name: example
namespace: default
labels:
qcloud-app: example
spec:
minReplicas: 1
maxReplicas: 2
metrics:
- type: Pods # 支持使用 Resource
pods:
metric:
name: k8s_pod_gpu_memory_used_bytes # GPU 内存使用量
target:
averageValue: "10240" # 10240 MiB
type: AverageValue
scaleTargetRef:
apiVersion: apps/v1beta2
kind: Deployment
name: nginx
更多指标参考HPA指标:容器服务 自动伸缩指标说明-TKE 标准集群指南-文档中心-腾讯云
配置 Grafana 面板
导入Grafana 面板数据
可以直接导入 TKE 整理的监控面板文件,如下:
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": null,
"graphTooltip": 0,
"id": 45,
"iteration": 1663666644725,
"links": [],
"liveNow": false,
"panels": [
{
"cacheTimeout": null,
"datasource": "${datasource}",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 5,
"x": 1,
"y": 0
},
"id": 12,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"text": {}
},
"pluginVersion": "8.2.7",
"targets": [
{
"exemplar": true,
"expr": "(pod_nvidia_gpu_memory_used_mib{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}/pod_nvidia_gpu_memory_total_mib{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})*100",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"title": "Memory Utilization",
"transparent": true,
"type": "gauge"
},
{
"cacheTimeout": null,
"datasource": "${datasource}",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 5,
"x": 6,
"y": 0
},
"id": 14,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"text": {}
},
"pluginVersion": "8.2.7",
"targets": [
{
"exemplar": true,
"expr": "pod_nvidia_gpu_duty_cycle{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"title": "GPU Utilization",
"transparent": true,
"type": "gauge"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prom-16fnzvzp",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 0
},
"hiddenSeries": false,
"id": 10,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"hideZero": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.2.7",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:1468"
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "pod_nvidia_gpu_duty_cycle{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU Utilization / Card",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:73",
"format": "percent",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:74",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${datasource}",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 11,
"x": 0,
"y": 7
},
"hiddenSeries": false,
"id": 8,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"hideZero": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.2.7",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"$$hashKey": "object:859"
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "(pod_nvidia_gpu_memory_used_mib{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}/pod_nvidia_gpu_memory_total_mib{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})*100",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Memory Utilization / Card",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:229",
"format": "percent",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:230",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${datasource}",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 7
},
"hiddenSeries": false,
"id": 2,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"hideZero": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.2.7",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "pod_nvidia_gpu_memory_used_mib{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Memory Usage/ Card",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:308",
"format": "mbytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:309",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${datasource}",
"fill": 4,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 16
},
"hiddenSeries": false,
"id": 6,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.2.7",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "pod_nvidia_gpu_power_usage_watts{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Power Usage",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:701",
"format": "watt",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:702",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": false,
"schemaVersion": 32,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": null,
"datasource": "${datasource}",
"definition": "label_values(pod_nvidia_gpu_num_devices, cluster)",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "cluster",
"options": [],
"query": {
"query": "label_values(pod_nvidia_gpu_num_devices, cluster)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"allValue": null,
"datasource": "${datasource}",
"definition": "label_values(pod_nvidia_gpu_num_devices{cluster=\"$cluster\"}, namespace)",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "namespace",
"options": [],
"query": {
"query": "label_values(pod_nvidia_gpu_num_devices{cluster=\"$cluster\"}, namespace)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": null,
"datasource": "${datasource}",
"definition": "label_values(pod_nvidia_gpu_num_devices{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "pod",
"options": [],
"query": {
"query": "label_values(pod_nvidia_gpu_num_devices{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "GPU / pod-eks",
"uid": "37tllT74z",
"version": 5
}