【优化篇】telegraf+shell脚本实现秒级的服务状态异常监测与告警
一、场景
修改成一个检测GrayLog5.1版本opensearch状态的脚本
脚本如下
vim /opt/check_opensearch_status.sh
#!/bin/bash
#
# check_opensearch_status.sh
#
# Compares the current "Active:" state of the opensearch systemd unit with the
# state recorded on the previous run. When the state changed, sends a DingTalk
# markdown message (recovery or failure) and records the new state.
#
# Output: prints the new state on stdout only when a change was detected, so a
#         caller that captures stdout sees one value per state transition.
#
# The shebang is required: the script relies on bash-only [[ ]] tests and is
# meant to be executed directly by path.

set -u

# Append-only log of every alert this script sends.
LOCK_FILE=/var/log/opensearch_record.log
# DingTalk robot webhook URL (replace the placeholder token; keep the real one
# out of version control).
WEBHOOK_URL="https://oapi.dingtalk.com/robot/send?access_token=XXXXXXXXXXXXXXXXXXXXXXX"
# File holding the state seen on the previous run.
STATUS_FILE="/tmp/opensearch_status.txt"
# File holding the state that counts as healthy.
NORMAL_STATUS_FILE="/opt/opensearch_normal_status.txt"
# Healthy state assumed when NORMAL_STATUS_FILE is missing or empty.
DEFAULT_NORMAL_STATUS="Active: active (running)"
# Upper bound (seconds) for the webhook call so a slow network cannot stall
# the run.
CURL_MAX_TIME=3

#######################################
# Logs an alert, posts it to DingTalk and records the new state.
# Globals:   LOCK_FILE, WEBHOOK_URL, STATUS_FILE, CURL_MAX_TIME,
#            host_ips, current_time, current_status (all read)
# Arguments: $1 - headline text placed after the host addresses
#            $2 - colour used for the state, as a #RRGGBB value
#            $3 - line written to the log describing the event
# Outputs:   the new state on stdout; everything else goes to LOCK_FILE
#######################################
report_change() {
  local headline=$1 color=$2 log_note=$3
  local message payload response

  # The \n sequences stay literal on purpose: they turn into line breaks once
  # the JSON string is decoded by the receiver.
  message="【告警通知】:${host_ips}的opensearch ${headline}\n\n【时间】:${current_time} \n\n【opensearch状态】:<font color=${color}>${current_status}</font>"
  payload="{\"msgtype\":\"markdown\",\"markdown\":{\"title\":\"告警通知\",\"text\":\"${message}\"}}"

  printf '%s\n' "--------------------------------" "$current_time" \
    "$log_note" "$message" >>"$LOCK_FILE" 2>&1

  # Capture the webhook response instead of letting it reach stdout, where it
  # would be mixed into the value the caller reads. The call runs even when
  # the log file is not writable.
  response=$(curl -s --max-time "$CURL_MAX_TIME" \
    -H "Content-Type: application/json" -d "$payload" "$WEBHOOK_URL" 2>&1)
  printf '%s\n' "$response" "--------------------------------" \
    >>"$LOCK_FILE" 2>&1

  # Remember the state so the same transition is reported only once.
  printf '%s\n' "$current_status" >"$STATUS_FILE"
  printf '%s\n' "$current_status"
}

# Timestamp used in both the log and the alert text.
current_time=$(date +"%Y-%m-%d %H:%M:%S")

# Host addresses for the alert text, without the trailing blank that
# `hostname -I` appends.
host_ips=$(hostname -I | sed 's/ *$//')

# Current state: the first "Active:" line of `systemctl status`, cut before
# "since" and trimmed, e.g. "Active: active (running)".
current_status=$(/usr/bin/systemctl status opensearch \
  | awk '!seen && /^ *Active:/ {
      sub(/ *since.*/, ""); sub(/^ */, ""); sub(/ *$/, "")
      print; seen = 1
    }')
# No "Active:" line at all (unit missing, systemctl failed): report a
# placeholder rather than an empty state.
current_status=${current_status:-Active: unknown}

# State recorded by the previous run; empty on the very first run.
previous_status=""
if [[ -r "$STATUS_FILE" ]]; then
  previous_status=$(<"$STATUS_FILE")
fi

# State that counts as healthy.
normal_status=$DEFAULT_NORMAL_STATUS
if [[ -s "$NORMAL_STATUS_FILE" ]]; then
  normal_status=$(<"$NORMAL_STATUS_FILE")
fi

if [[ "$current_status" == "$normal_status" ]]; then
  # Healthy now: report only if the previous run saw something else.
  if [[ "$previous_status" != "$normal_status" ]]; then
    report_change "状态已恢复正常。" "#67C23A" "服务状态已恢复运行,发送dingding告警"
  fi
elif [[ "$current_status" != "$previous_status" ]]; then
  # Unhealthy and different from the last recorded state: raise an alert.
  report_change "状态异常告警!" "#FF0000" "服务状态异常,发送dingding告警"
fi
需求:
Crontab 定时任务的执行频率最高只能达到每分钟一次。
(图片点击放大查看)
如果服务在 1 分钟内重启后又恢复,很可能监测不到这次状态异常变化。
下面借助telegraf的inputs.exec模块实现秒级的状态监测
具体操作如下
1、下载telegraf的rpm包并安装
# Download the RPM from https://github.com/influxdata/telegraf/releases
# and install it.
rpm -ivh telegraf-1.27.3-1.x86_64.rpm
(图片点击放大查看)
2、生成配置文件样本并替换修改
# Generate a sample config limited to the exec input and the graylog output,
# keep the packaged config as a backup, then install and edit the new one.
telegraf --sample-config --input-filter exec --output-filter graylog > telegraf.conf
mv /etc/telegraf/telegraf.conf /etc/telegraf/telegraf.conf_default
mv ./telegraf.conf /etc/telegraf/telegraf.conf
vim /etc/telegraf/telegraf.conf
(图片点击放大查看)
(图片点击放大查看)
最终的配置文件如下
[root@centos opt]# cat /etc/telegraf/telegraf.conf | grep -v '^#' | grep -v '^$' | grep -v '^.*##' | grep -v '^.*#'
# Minimal telegraf configuration: run the status-check script on a fixed
# interval and forward whatever it prints to the Graylog output.
[global_tags]
[agent]
interval = "10s"
round_interval = true
metric_batch_size = 1000
metric_buffer_limit = 10000
collection_jitter = "0s"
flush_interval = "10s"
flush_jitter = "0s"
precision = "0s"
hostname = ""
omit_hostname = false
[[outputs.graylog]]
servers = ["udp://127.0.0.1:12201"]
[[inputs.exec]]
commands = ["/opt/check_opensearch_status.sh"]
# The script must finish within this time.
timeout = "5s"
# Measurement name; it names the service actually being checked.
name_override = "opensearch_service_status_check"
# The script prints a single string (the new state) when the state changes.
data_format = "value"
data_type = "string"
# How often the script is executed.
interval = "10s"
(图片点击放大查看)
3、测试
记得将/tmp/opensearch_status.txt 状态记录文件设置成可写
并且执行
# Record the state that counts as healthy.
echo "Active: active (running)" > /opt/opensearch_normal_status.txt
# Make sure the state file exists before changing its mode; it is a data
# file, so it needs read/write permission only.
touch /tmp/opensearch_status.txt
chmod 666 /tmp/opensearch_status.txt
# Trigger a failure alert followed by a recovery alert.
systemctl stop opensearch.service
systemctl start opensearch.service
(图片点击放大查看)
效果如下
(图片点击放大查看)
上面的 interval = "10s" 表示每 10 秒执行一次状态监测脚本;当然也可以设置得更短,例如 5s。