docker方式部署prometheus监控告警

2023-12-21 08:19:53

1. docker-compose环境

本案例需要使用docker-compose,若没有环境需先下载docker-compose,下载地址:https://github.com/docker/compose/releases (GitHub - docker/compose: Define and run multi-container applications with Docker)

本例使用v2.23.3,如果网络环境不好建议从github上下载好再上传到服务器

# Download the docker-compose v2.23.3 release binary for Linux x86_64 from GitHub.
[root@node ~]# wget https://github.com/docker/compose/releases/download/v2.23.3/docker-compose-linux-x86_64
# Copy it to /usr/local/bin under the name `docker-compose`.
[root@node ~]# cp docker-compose-linux-x86_64 /usr/local/bin/docker-compose
# Make the copied binary executable.
[root@node ~]# chmod +x /usr/local/bin/docker-compose
# Print the version to confirm the command can be run.
[root@node ~]# docker-compose --version

2. 文件目录

按照以下路径创建目录和文件

/home/docker-prometheus/
├── alertmanager
│   ├── alertmanager.yml
│   └── template
│       └── wechat.tmpl
├── docker-compose.yml
├── grafana
│   ├── data
│   └── grafana.ini
└── prometheus
    ├── prometheus.yml
    ├── rules
    │   └── alerts.yml
    └── sd_config
        ├── linux.yml
        ├── snmp.yml
        └── windows.yml

3. alertmanager.yml

# Alertmanager configuration: every alert is routed to one webhook receiver
# named "wechat".
global:
  resolve_timeout: 15s

route:
  receiver: "wechat"
  group_by:
    - "env"
    - "instance"
    - "type"
    - "group"
    - "job"
    - "alertname"
  # After an alert arrives, wait 15s to see whether more follow so they can
  # be sent together.
  group_wait: 15s
  # Interval between notifications.
  group_interval: 15s
  # Interval before an alert notification is repeated.
  repeat_interval: 30s

receivers:
  - name: "wechat"
    webhook_configs:
      - url: "http://192.168.32.146:8089/adapter/wx"
        send_resolved: true

# A 'critical' alert inhibits a 'warning' alert when all labels listed under
# `equal` have the same values on both.
inhibit_rules:
  - source_match:
      severity: "critical"
    target_match:
      severity: "warning"
    equal:
      - "env"
      - "instance"
      - "type"
      - "group"
      - "job"
      - "alertname"

4. wechat.tmpl

{{ define "wechat.default.message" }}
{{- /* Renders a text message with a "firing" section and a "resolved" section. */ -}}
{{- /* In each section only the first alert is printed (the `eq $index 0` guard). */ -}}
{{- /* Timestamps are shifted by 28800e9 ns (8 hours) before formatting. */ -}}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- /* Iterate firing alerts only, so a resolved alert that happens to be first in .Alerts is never shown under the firing banner. */ -}}
{{- range $index, $alert := .Alerts.Firing -}}
{{- if eq $index 0 }}
========= 监控报警 =========
告警状态:{{   .Status }}
告警级别:{{ .Labels.severity }}
告警类型:{{ $alert.Labels.alertname }}
故障主机: {{ $alert.Labels.instance }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
触发阀值:{{ .Annotations.value }}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end =  =========
{{- end }}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- /* Iterate resolved alerts only, so EndsAt and Status always belong to an alert that has actually recovered. */ -}}
{{- range $index, $alert := .Alerts.Resolved -}}
{{- if eq $index 0 }}
========= 异常恢复 =========
告警类型:{{ .Labels.alertname }}
告警状态:{{   .Status }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
========= = end =  =========
{{- end }}
{{- end }}
{{- end }}
{{- end }}

5. docker-compose.yml

# Compose file starting four containers on one shared network:
# prometheus, grafana, alertmanager and webhook-adapter.
version: '3.3'
services:
   prometheus:
     container_name: prometheus
     image: prom/prometheus
     ports:
       - "9090:9090"
     volumes:
       # Main config, alert rules and file-based service discovery targets.
       - /home/docker-prometheus/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
       - /home/docker-prometheus/prometheus/rules/alerts.yml:/etc/prometheus/rules/alerts.yml
       - /home/docker-prometheus/prometheus/sd_config/:/etc/prometheus/sd_config/
       # Use the host's Asia/Shanghai zoneinfo file as the container's local time.
       - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime
     restart: always
     networks:
       prometheus_net:

   grafana:
     image: grafana/grafana
     container_name: grafana
     ports:
       - "3000:3000"
     environment:
      # NOTE(review): hard-coded admin password; change it before real use.
      - "GF_SECURITY_ADMIN_PASSWORD=123456Aa"
      - "GF_INSTALL_PLUGINS=alexanderzobnin-zabbix-app"
     privileged: true    # added when "Permission denied" errors occur
     volumes:
       - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime
       - /home/docker-prometheus/grafana/data:/var/lib/grafana
       # The host file must exist before starting; otherwise Docker creates a
       # directory at this path and the file mount fails.
       - /home/docker-prometheus/grafana/grafana.ini:/etc/grafana/grafana.ini
     restart: always
     networks:
       prometheus_net:

   alertmanager:
     image: prom/alertmanager
     container_name: alertmanager
     ports:
       - '9093:9093'
     volumes:
       - /home/docker-prometheus/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
       - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime
       # NOTE(review): the template is only loaded if alertmanager.yml lists
       # this path under `templates:` - confirm.
       - /home/docker-prometheus/alertmanager/template/wechat.tmpl:/etc/alertmanager/wechat.tmpl
     restart: always
     networks:
       prometheus_net:


   webhook-adapter:
     image: guyongquan/webhook-adapter:latest
     container_name: webhook-adapter
     hostname: webhook-adapter
     ports:
       - "8089:80"
     restart: always
     command:
       # Format: --adapter=<script>=/<path>=<target webhook URL>.
       # Replace <YOUR_KEY> with the key of your WeChat Work group robot.
       - "--adapter=/app/prometheusalert/wx.js=/wx=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=<YOUR_KEY>"
     networks:
       prometheus_net:

networks:
  prometheus_net:

6. prometheus.yml

# Prometheus server configuration: global intervals, the Alertmanager
# endpoint, rule files and three file-discovered scrape jobs.
global:
  # Scrape every 15 seconds (the default is every 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is every 1 minute).
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "192.168.32.146:9093"

rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: "linux"
    file_sd_configs:
      - refresh_interval: 30m  # re-check the file every 30 minutes
        files:
          - "/etc/prometheus/sd_config/linux.yml"

  - job_name: "windows"
    file_sd_configs:
      - refresh_interval: 30m
        files:
          - "/etc/prometheus/sd_config/windows.yml"

  # SNMP monitoring of switches.
  - job_name: "snmp"
    metrics_path: /snmp
    params:
      module:
        - HUAWEI
    file_sd_configs:
      - refresh_interval: 30m
        files:
          - "/etc/prometheus/sd_config/snmp.yml"
    relabel_configs:
      # Pass the discovered address as the `target` URL parameter.
      - source_labels:
          - __address__
        target_label: __param_target
      # Keep that address as the `instance` label.
      - source_labels:
          - __param_target
        target_label: instance
      # Send the actual scrape request to this fixed address.
      - target_label: __address__
        replacement: "192.168.32.146:9116"

7. alerts.yml

groups:
# Fires when a target has had `up == 0` for 15 seconds.
- name: general.rules
  rules:
  - alert: 主机宕机
    expr: up == 0
    for: 15s
    labels:
      # The key must be spelled `severity`: that is the label the other rule
      # groups set and the one read by severity-based routing and templates.
      severity: error
    annotations:
      summary: "主机 {{ $labels.instance }} 主机宕机"
      description: "{{ $labels.instance }} job {{ $labels.job }} 实例未在线!"

# Fires when memory usage (100 minus the rounded MemAvailable/MemTotal
# percentage) has been above 80 for 1 minute.
- name: alters.rules
  rules:
    - alert: 内存使用率
      for: 1m
      expr: 100 - round(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100) > 80
      labels:
        # severity: the alert level. There are three levels - warning,
        # critical and emergency - in increasing order of seriousness.
        severity: emergency
      annotations:
        # summary: a short overview of the alert.
        summary: "主机 {{ $labels.instance }} 内存使用率过高"
        # description: the detailed alert message.
        description: "{{ $labels.instance }} 内存使用大于 80% (当前值: {{ $value }})"

# Fires when per-instance CPU usage (100 minus the rounded average idle
# percentage) has been above 80 for 1 minute.
- name: CPU.rules
  rules:
    - alert: CPU Usage
      for: 1m
      expr: 100 - round(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
      labels:
        severity: error
      annotations:
        summary: "实例 {{ $labels.instance }} CPU使用率过高"
        description: "实例CPU使用率超过 80% (当前值为: {{ $value }}%)"
        ip: "{{ $labels.ip }}"

# Fires when filesystem usage (100 minus the rounded free/size percentage)
# has been above 80 for 1 minute on ext3, ext4 or xfs filesystems.
- name: Disk.rules
  rules:
    - alert: Disk Usage
      # Both sides use the same fstype selector. If the denominator leaves out
      # ext3, ext3 series find no match and can never trigger the alert.
      expr: 100 - round(node_filesystem_free_bytes{fstype=~"ext3|ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs"}* 100) > 80
      for: 1m
      labels:
        severity: error
      annotations:
        summary: "实例 {{ $labels.instance }} 磁盘使用率过高"
        description: "实例磁盘使用率超过 80% (当前值为: {{ $value }}%)"
        ip: "{{ $labels.ip }}"

8. linux.yml

# Target list for file-based service discovery; every entry is "host:port"
# on port 9100.
# NOTE(review): the "..." hosts are placeholders - replace them with real
# addresses before use.
- targets:
  - "192.168.xx.xx:9100"
  - "...:9100"
  - "...:9100"
  - "...:9100"
  - "..:9100"
  - "...:9100"

9. 运行

docker-compose up -d # use this from the directory that contains docker-compose.yml
docker-compose -f /home/docker-prometheus/docker-compose.yml up -d # use this from any other directory
[root@gt-32 docker-prometheus]# docker-compose ps

若发现grafana状态没有up可通过docker logs -f grafana追踪日志,一般情况为目录没有权限,提示如下:

mkdir: can't create directory '/var/lib/grafana/plugins': Permission denied
GF_PATHS_DATA='/var/lib/grafana' is not writable.

解决办法:向grafana目录下的data目录加权限可以解决

# The Grafana image runs as UID 472, so that user must own the bind-mounted
# data directory to write to it; adding the execute bit does not grant write access.
[root@gt-32 docker-prometheus]# chown -R 472:0 grafana/data/

文章来源:https://blog.csdn.net/m0_60169980/article/details/135058063
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。