Prometheus监控企业微信告警
使用 Prometheus 进行监控,并通过企业微信发送告警;整套环境仅需 docker-compose 即可快速部署。
docker-compose.yaml
# Compose stack: Prometheus + Alertmanager + Grafana + a webhook adapter that
# forwards Alertmanager notifications to WeCom (Enterprise WeChat) / DingTalk.
version: '3.2'
services:
  # Metrics collection and alert-rule evaluation.
  prometheus:
    image: prom/prometheus
    restart: "always"
    ports:
      # Port mappings are quoted so YAML never re-types "host:container".
      - "9090:9090"
    container_name: "prometheus"
    volumes:
      - "./data/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml"
      - "./data/rules:/etc/prometheus/rules"
      - "./data/prometheus/data:/prometheus"
    command:
      # Config file path; must match the prometheus.yml volume mount above.
      - '--config.file=/etc/prometheus/prometheus.yml'
      # Data directory; must match the data volume mount above.
      - '--storage.tsdb.path=/prometheus'
  # Alerting module.
  alertmanager:
    image: prom/alertmanager:latest
    restart: "always"
    ports:
      - "9093:9093"
    container_name: "alertmanager"
    volumes:
      - "./data/alert/alertmanager.yml:/etc/alertmanager/alertmanager.yml"
  # Web UI.
  grafana:
    image: grafana/grafana
    restart: "always"
    ports:
      - "3000:3000"
    container_name: "grafana"
    volumes:
      # Copy grafana.ini out of the image yourself before mounting it here.
      - "./data/grafana/grafana.ini:/etc/grafana/grafana.ini"
      - "./data/grafana/grafana-storage:/var/lib/grafana"
  # WeCom (Enterprise WeChat) / DingTalk alert forwarding.
  webhook-adapter:
    image: guyongquan/webhook-adapter
    restart: "always"
    ports:
      # Container port 80 is published on host port 8060.
      - "8060:80"
    container_name: "webhook-adapter"
    command:
      # NOTE(review): `key=*` looks like a redacted placeholder — substitute
      # the real WeCom robot key before deploying.
      - '--adapter=/app/prometheusalert/wx.js=/wx=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=*'
      # - '--adapter=/app/prometheusalert/dingtalk.js=/dingtalk=https://oapi.dingtalk.com/robot/send?access_token={token}#{secret}'
alertmanager.yml
# Alertmanager configuration: every alert is routed to one webhook receiver.
global:
  resolve_timeout: 5m
route:
  # Alerts sharing the same alertname are batched into one notification.
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'webhook'
receivers:
  - name: 'webhook'
    webhook_configs:
      # Points at host port 8060, which the compose file publishes for the
      # webhook-adapter container; the adapter is registered under `/wx`.
      - url: 'http://172.20.57.238:8060/adapter/wx'
        # Also send a notification when an alert resolves.
        send_resolved: true
prometheus.yml
# Prometheus server configuration.
global:
  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  scrape_interval: 15s
  # Evaluate rules every 15 seconds. The default is every 1 minute.
  evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Alertmanager endpoint (port 9093 is published by the compose file).
            - "172.20.57.238:9093"
rule_files:
  # Relative path next to this config file; the compose file mounts
  # ./data/rules at /etc/prometheus/rules.
  - "rules/*.yml"
scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: 'myself'
    static_configs:
      - targets: ['172.20.54.113:9114']
test1_rules.yml
groups:
# Memory pressure: fires when available memory stays below 10% of total.
- name: MemoryUsage
  rules:
  - alert: HighMemoryUsage
    # Available memory as a percentage of total; below 10% means usage is
    # above 90%.
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} MEM usage high"
      # $value is the AVAILABLE percentage, so it is labelled as such rather
      # than being presented as the usage figure.
      description: '{{ $labels.instance }} MEM usage above 90% (available: {{ printf "%.2f" $value }}%)'
# Target availability: fires when a scrape target has been down for 5 minutes.
- name: Instance
  rules:
  - alert: InstanceDown
    # `up` is 0 when the last scrape of the target failed.
    expr: up == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
# Root filesystem: fires when free space on "/" stays below 10% for 3 minutes.
- name: 根分区剩余空间
  rules:
  - alert: 根分区剩余空间
    # No `job` matcher: the scrape config shown in this document defines no
    # job named "node-exporter", so filtering on it would match nothing and
    # the alert could never fire.
    expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
    for: 3m
    labels:
      severity: warning
    annotations:
      description: ' {{ $labels.instance }} 节点 {{ $labels.device }} 根分区文件系统剩余空间: {{ printf "%.2f" $value }}% '
      summary: '根分区剩余空间不足 10%'
# /mnt/aps filesystem: fires when used space stays above 90% for 3 minutes.
- name: /mnt/aps所在磁盘空间
  rules:
  - alert: /mnt/aps所在磁盘空间剩余空间
    # Used percentage = 100 * (1 - available / size).
    expr: 100 * (1 - node_filesystem_avail_bytes{mountpoint="/mnt/aps"} / node_filesystem_size_bytes{mountpoint="/mnt/aps"}) > 90
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "High disk usage detected"
      description: "/mnt/aps所在磁盘空间剩余空间不足 10%"
# Host CPU: fires when CPU utilisation stays above 90% for 5 minutes.
- name: 主机CPU使用率告警
  rules:
  - alert: 主机CPU使用率告警
    # Utilisation = 100 - (per-instance average idle fraction * 100).
    expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "CPU近5分钟使用率大于90%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
# Host iowait: fires when iowait is at least 90% of CPU time for 5 minutes.
- name: 主机iowait较高
  rules:
  - alert: 主机iowait较高
    # iowait CPU seconds over the last 5m as a percentage of all CPU seconds.
    expr: (sum(increase(node_cpu_seconds_total{mode='iowait'}[5m]))by(instance)) / (sum(increase(node_cpu_seconds_total[5m]))by(instance)) *100 >= 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "CPU ioWait近5分钟占比大于等于90%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
# Host disk reads: fires when read throughput stays above 180 MiB/s for 5 minutes.
- name: 主机磁盘读过大
  rules:
  - alert: 主机磁盘读过大
    # Per-instance read rate in bytes/s, compared against 180 * 1024 * 1024.
    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 180 * 1024 * 1024
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "磁盘读过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"
# Host disk writes: fires when write throughput stays above 180 MiB/s for 5 minutes.
- name: 主机磁盘写过大
  rules:
  - alert: 主机磁盘写过大
    # Per-instance write rate in bytes/s, compared against 180 * 1024 * 1024.
    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 180 * 1024 * 1024
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "磁盘写过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"
# TCP sockets: fires when the TIME_WAIT socket count stays at or above 3800
# for 3 minutes.
- name: TCP连接数
  rules:
  - alert: TCPTimeWait数量过多告警
    # node_sockstat_TCP_tw is a gauge (current socket count), so it is compared
    # directly; rate() is only meaningful for counters and would turn the count
    # into a per-second delta that never approaches the threshold.
    expr: sum by(instance) (node_sockstat_TCP_tw) >= 3800
    for: 3m
    labels:
      severity: warning
    annotations:
      # The value is a socket count, not a percentage.
      summary: "TCP TimeWait数量大于等于3800, 实例: {{$labels.instance}},当前值: {{ $value }}"
# Pod health: restart storms and pods stuck in Pending/Failed.
- name: Pod 监控
  rules:
  - alert: Pod重启次数过多
    # `job` is kept in the grouping because the description template reads
    # {{ $labels.job }}; aggregating it away would render it empty.
    expr: sum by (job,instance,namespace,pod) (increase(kube_pod_container_status_restarts_total[1h])) > 15
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "Pod {{ $labels.pod }} 连续重启"
      description: "实例:{{$labels.job}}的{{$labels.instance}}命名空间{{$labels.namespace}}下的Pod {{ $labels.pod }} 在过去1小时内重启次数超过15次,请及时处理。"
  - alert: Pod状态异常
    # Pending pods exclude instance 172.20.53.122; Failed pods additionally
    # exclude instance 172.20.54.112.
    expr: kube_pod_status_phase{phase="Pending",instance !~ "172.20.53.122:.*"} == 1 or kube_pod_status_phase{phase="Failed",instance !~ "172.20.53.122:.*",instance !~ "172.20.54.112:.*"} == 1
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "Pod {{ $labels.pod }} 状态异常"
      description: "实例:{{$labels.job}}的{{$labels.instance}}命名空间{{$labels.namespace}}下的Pod {{ $labels.pod }} 当前状态异常,请及时处理。"
目录结构
prometheus/
├── data
│   ├── alert
│   │   └── alertmanager.yml
│   ├── grafana
│   │   ├── grafana.ini
│   │   └── grafana-storage
│   │       ├── alerting
│   │       │   └── 1
│   │       │       └── __default__.tmpl
│   │       ├── csv
│   │       ├── grafana.db
│   │       ├── plugins
│   │       └── png
│   ├── prometheus
│   │   ├── data
│   │   │   ├── 01H78F28QM9WK74NRM1M8JHST8
│   │   │   │   ├── chunks
│   │   │   │   │   ├── 000001
│   │   │   │   │   └── 000002
│   │   │   │   ├── index
│   │   │   │   ├── meta.json
│   │   │   │   └── tombstones
│   │   │   ├── 01H793NE7B147SQC3G732RTMEV
│   │   │   │   ├── chunks
│   │   │   │   │   └── 000001
│   │   │   │   ├── index
│   │   │   │   ├── meta.json
│   │   │   │   └── tombstones
│   │   │   ├── 01H79HCJ0HGJM838GVTP49M435
│   │   │   │   ├── chunks
│   │   │   │   │   └── 000001
│   │   │   │   ├── index
│   │   │   │   ├── meta.json
│   │   │   │   └── tombstones
│   │   │   ├── 01H79HCXG5Z793VZQD75PD3XGR
│   │   │   │   ├── chunks
│   │   │   │   │   └── 000001
│   │   │   │   ├── index
│   │   │   │   ├── meta.json
│   │   │   │   └── tombstones
│   │   │   ├── chunks_head
│   │   │   │   ├── 000025
│   │   │   │   ├── 000026
│   │   │   │   └── 000027
│   │   │   ├── lock
│   │   │   ├── queries.active
│   │   │   └── wal
│   │   │       ├── 00000081
│   │   │       ├── 00000082
│   │   │       ├── 00000083
│   │   │       ├── 00000084
│   │   │       ├── 00000085
│   │   │       └── checkpoint.00000080
│   │   │           └── 00000000
│   │   └── prometheus.yml
│   └── rules
│       └── test1_rules.yml
├── docker-compose.yaml
└── test_alert.sh
参考链接
https://blog.51cto.com/erdong/4755694