文章详情

短信预约-IT技能 免费直播动态提醒

请输入下面的图形验证码

提交验证

短信预约提醒成功

prometheus监控服务器、网络设备、Vmware-esxi

2023-10-24 05:10

关注

采用prometheus方式进行部署,监控本地服务器、网络线路和域名访问,达到阈值时触发告警,从而快速定位问题源、加快响应速度

1、创建所需系统结构目录并给目录授权

2、prometheus部署

1)创建docker-compose文件

vim docker-compose.yml

-------------------------------------包含部署grafana、consul、alertmanager----------------------------------

# Compose stack for the monitoring host: Prometheus, Alertmanager, Grafana,
# the blackbox / VMware / SNMP exporters and an nginx landing page.
# All services share the user-defined bridge network "prom".
version: '3.7'

services:
  prometheus:
    depends_on:
      - alertmanager
    image: prom/prometheus:latest
    restart: always
    container_name: prometheus
    environment:
      # IANA zone name, the same value the alertmanager service uses.
      - TZ=Asia/Shanghai
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./rules:/etc/prometheus/rules/
      - ./prometheus/data:/prometheus
      - /etc/hosts:/etc/hosts
      - /etc/localtime:/etc/localtime
    ports:
      - "9090:9090"
    networks:
      - prom

  alertmanager:
    image: prom/alertmanager:latest
    restart: always
    container_name: alertmanager
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - ./alertmanager/mail.tmpl:/etc/alertmanager/mail.tmpl
      - /etc/localtime:/etc/localtime
    environment:
      TZ: Asia/Shanghai
    ports:
      - "9093:9093"
      - "9094:9094"
    networks:
      - prom

  grafana:
    depends_on:
      - prometheus
    container_name: grafana
    image: grafana/grafana:latest
    restart: always
    volumes:
      - ./grafana:/var/lib/grafana
      - /etc/localtime:/etc/localtime
    ports:
      - "3000:3000"
    # Joined to the same network as the other services so Prometheus is
    # reachable from Grafana by service name.
    networks:
      - prom

  blackbox_exporter:
    image: quay.io/prometheus/blackbox-exporter:latest
    restart: always
    container_name: blackbox_exporter
    volumes:
      - /etc/localtime:/etc/localtime
      - ./blackbox_exporter/blackbox.yml:/config/blackbox.yml
    ports:
      - "9115:9115"
    command: --config.file=/config/blackbox.yml
    networks:
      - prom

  # One vmware_exporter container per monitored host; each reads its own
  # env file and is published on a different host port.
  vmware_exporter:
    image: pryorda/vmware_exporter
    restart: always
    container_name: vmware_exporter
    env_file:
      - ./vm_config/vm_config.env
    ports:
      - "9273:9272"
    networks:
      - prom

  nginx:
    image: nginx:latest
    restart: always
    container_name: nginx
    volumes:
      - /usr/share/zoneinfo/PRC:/etc/localtime
      - ./nginx/html:/usr/share/nginx/html
      - ./nginx/conf.d:/etc/nginx/conf.d
      - ./nginx/image:/image
    ports:
      - "80:80"
    networks:
      - prom

  vmware_exporter2:
    image: pryorda/vmware_exporter
    restart: always
    container_name: vmware_exporter2
    env_file:
      - ./vm_config/vm2_config.env
    ports:
      - "9272:9272"
    networks:
      - prom

  vmware_exporter-jk:
    image: pryorda/vmware_exporter
    restart: always
    container_name: vmware_exporter-jk
    env_file:
      - ./vm_config/vm3_config.env
    ports:
      - "9274:9272"
    networks:
      - prom

  # SNMP exporter using the generated, device-specific snmp.yml.
  snmp-AD:
    image: prom/snmp-exporter
    restart: always
    container_name: snmp_exporter-AD
    volumes:
      - ./snmp_exporter/snmp_exporter-AD/snmp_exporter/generator/snmp.yml:/etc/snmp_exporter/snmp.yml
    ports:
      - "9116:9116"
    command: --config.file=/etc/snmp_exporter/snmp.yml
    networks:
      - prom

  # No snmp.yml is mounted here, so --config.file resolves to whatever the
  # image itself provides at that path.
  snmp-AD-2:
    image: prom/snmp-exporter
    restart: always
    container_name: snmp_exporter-AD-2
    ports:
      - "9117:9116"
    command: --config.file=/etc/snmp_exporter/snmp.yml
    networks:
      - prom

networks:
  prom:
    driver: bridge

-------------------------------------------------------------------------------------------------------------------

2)配置prometheus配置文件

vim prometheus.yml

# Prometheus server configuration: global intervals, alert rule files,
# the Alertmanager endpoint and all scrape jobs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - '/etc/prometheus/rules/ad-alert/*.yml'
  - '/etc/prometheus/rules/https-alert/*.yml'
  - '/etc/prometheus/rules/https-duration/*.yml'
  - '/etc/prometheus/rules/node-alert/*.yml'

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'X.X.X.X:9093'

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'X.X.X.X:9090'

  - job_name: 'vm-exporter'
    static_configs:
      - targets:
          - 'X.X.X.X:9272'  # local physical host monitoring
          - 'X.X.X.X:9273'
          - 'X.X.X.X:9274'

  # Federation: pull series from another Prometheus server.
  - job_name: 'federate'
    metrics_path: '/federate'
    honor_labels: true
    params:
      'match[]':
        - '{job="prometheus"}'
        - '{__name__=~".*"}'
    static_configs:
      - targets:
          - 'X.X.X.X:9090'

  # Targets are discovered from Consul (services registered automatically).
  - job_name: 'consul-node-exporter'
    metrics_path: /metrics
    scheme: http
    scrape_interval: 15s
    scrape_timeout: 5s
    consul_sd_configs:
      - server: 'X.X.X.X:8500'
        refresh_interval: 30s
        services: ['node-exporter']
    relabel_configs:
      # Take the instance label from the "instance=<name>" Consul tag.
      - source_labels: [__meta_consul_tags]
        regex: '.*,instance=([^,]*).*'
        target_label: instance
      - source_labels: [__meta_consul_service_address]
        target_label: 'ipaddress'
      - source_labels: [__meta_consul_service_id]
        target_label: 'hostname'
      - source_labels: [__meta_consul_service_metadata_group]
        target_label: 'localhost'
      - source_labels: [__meta_consul_service_metadata_environment]
        target_label: 'environment'
      - source_labels: [__meta_consul_service_metadata_Project]
        target_label: 'Project'
      - source_labels: [__meta_consul_service]
        target_label: 'service'

  # Local network device monitoring (custom metrics, module "sangfor").
  - job_name: 'snmp'
    metrics_path: /snmp
    params:
      module: [sangfor]
    relabel_configs:
      # The listed target becomes the ?target= parameter and the instance
      # label; the scrape itself is sent to the SNMP exporter.
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: X.X.X.X:9116
    static_configs:
      - targets:
          - '1x.x.x.x'  # address of the monitored network device
        labels:
          hostname: AD
          group: snmp
    scrape_interval: 30s
    scrape_timeout: 30s

  # Local network device monitoring (module "if_mib").
  - job_name: 'snmp-1'
    metrics_path: /snmp
    params:
      module: [if_mib]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: X.X.X.X:9117
    static_configs:
      - targets:
          - 'x.x.x.x'  # address of the monitored network device
        labels:
          hostname: AD-2
          group: snmp
    scrape_interval: 30s
    scrape_timeout: 30s

  # Black-box monitoring: access quality of the listed domains.
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for an HTTP 200 response.
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: X.X.X.X:9115  # The blackbox exporter's real hostname:port.
    static_configs:
      - targets:
          - 'https://www.baidu.com'
          - 'https://www.google.com'
          - 'https://www.github.com'
          - 'https://www.youtube.com'
          - 'https://activity.huaweicloud.com'
          - 'https://www.aliyun.com'
          - 'https://cloud.tencent.com'
          - 'https://www.tapd.cn'
          - 'https://www.openai.com'
          - 'https://www.pinterest.com'
          - 'https://www.qq.com'
          - 'https://www.bilibili.com'

3、添加告警rules

1)服务器告警:

groups:

2)域名探测延迟告警

groups:

3)域名连接告警

groups:

4)线路状态告警、网络流量线路告警

groups:

--------------------------------------------------------------------------------------------------------------------------------------------------------------------

groups:

4、配置alertmanager

1)配置alertmanager配置文件 vim alertmanager.yml

# Alertmanager configuration: SMTP settings and severity-based routing.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: X.X.X.X@163.com
  smtp_auth_username: X.X.X.X@163.com
  smtp_auth_password: X.X.X.X
  smtp_require_tls: false

# Notification templates; the compose file mounts mail.tmpl at this path.
templates:
  - '/etc/alertmanager/mail.tmpl'

route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 6h
  receiver: 'email'
  routes:
    # Link-down and host-down alerts. "continue: true" lets evaluation go
    # on to the next route, so critical alerts reach both the webhook and
    # e-mail; without it the second critical route is never evaluated.
    - match:
        severity: 'critical'
      receiver: 'webhook-critical'
      continue: true
    - match:
        severity: 'critical'
      receiver: 'email'
    # Service alerts. "match" compares the whole label value, so matching
    # either P1 or P2 needs a regular expression.
    - match_re:
        severity: 'P1|P2'
      receiver: 'email'
    # Link alerts.
    - match:
        severity: 'warning'
      receiver: 'https-alert'

# NOTE(review): the receiver definitions are missing from this listing.
# 'email', 'webhook-critical' and 'https-alert' must all be defined here.
receivers:

inhibit_rules:

2)配置告警模板 vim mail.tmpl (网络线路告警和机器告警不使用同一个告警模板)

{{/*
  email.to.html: notification body for host alerts.
  The first section renders one entry per firing alert, the second one
  entry per resolved alert. Each loop ranges over its own subset
  (.Alerts.Firing / .Alerts.Resolved) so an alert is never listed under
  the wrong heading when a group contains both kinds.
*/}}
{{ define "email.to.html" }}

{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts.Firing }}

==========异常告警==========

告警程序: prometheus_alert

告警级别: {{ .Labels.severity }} 级

告警类型: {{ .Labels.alertname }}

故障主机: {{ .Labels.instance }}

告警主题: {{ .Annotations.summary }}

告警详情: {{ .Annotations.description }}

触发时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}

{{ end }}{{ end -}}

{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts.Resolved }}

==========异常恢复==========

告警程序: prometheus_alert

故障主机: {{ .Labels.instance }}

故障主题: {{ .Annotations.summary }}

告警详情: {{ .Annotations.description }}

告警时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}

恢复时间: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}

{{ end }}{{ end -}}

{{- end }}

{{/*
  email03.to.html: notification body without the alert-type and host
  fields. The first section renders one entry per firing alert, the
  second one entry per resolved alert. Each loop ranges over its own
  subset (.Alerts.Firing / .Alerts.Resolved) so an alert is never listed
  under the wrong heading when a group contains both kinds.
*/}}
{{ define "email03.to.html" }}

{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts.Firing }}

==========异常告警==========

告警程序: prometheus_alert

告警级别: {{ .Labels.severity }} 级

告警主题: {{ .Annotations.summary }}

告警详情: {{ .Annotations.description }}

触发时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}

{{ end }}{{ end -}}

{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts.Resolved }}

==========异常恢复==========

告警程序: prometheus_alert

告警级别: {{ .Labels.severity }} 级

故障主题: {{ .Annotations.summary }}

告警详情: {{ .Annotations.description }}

告警时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}

恢复时间: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}

{{ end }}{{ end -}}

{{- end }}

5、配置成consul

1)采用分离部署,consul单独写一个docker-compose.yml,集群部署方式

2)vim docker-compose.yml

# Consul cluster deployed separately from the monitoring stack:
# three server agents (consul1-3) and one client agent (consul4) that
# serves the UI and publishes port 8500.
version: '3.7'

services:
  # First server; waits for three servers before bootstrapping the cluster.
  consul1:
    image: hashicorp/consul:latest
    container_name: consul1
    restart: always
    command: agent -server -client=0.0.0.0 -bootstrap-expect=3 -node=consul1
    volumes:
      - ./consul1/data:/consul/data
      - ./consul1/config:/consul/config
    networks:
      - prom

  consul2:
    image: hashicorp/consul:latest
    container_name: consul2
    restart: always
    command: agent -server -client=0.0.0.0 -retry-join=consul1 -node=consul2
    volumes:
      - ./consul2/data:/consul/data
      - ./consul2/config:/consul/config
    networks:
      - prom

  consul3:
    image: hashicorp/consul:latest
    container_name: consul3
    restart: always
    command: agent -server -client=0.0.0.0 -retry-join=consul1 -node=consul3
    volumes:
      - ./consul3/data:/consul/data
      - ./consul3/config:/consul/config
    networks:
      - prom

  # Client agent (no -server flag) with the web UI enabled.
  consul4:
    image: hashicorp/consul:latest
    container_name: consul4
    restart: always
    ports:
      - "8500:8500"
    command: agent -client=0.0.0.0 -retry-join=consul1 -ui -node=client1
    volumes:
      - ./consul4/data:/consul/data
      - ./consul4/config:/consul/config
    networks:
      - prom

networks:
  prom:
    driver: bridge

3) 配置自动注册脚本、hosts注册主机信息(注意这两个文件必须放在同一目录下)

[root@prometheus-2 consul]# cat hosts

prometheus X.X.X.X

prometheus-test X.X.X.X

test-nginx X.X.X.X

snipeit X.X.X.X

prometheus-2 X.X.X.X

--------------------------------------------------------------------------------------------

[root@prometheus-2 consul]# cat linux-node.sh

#!/bin/bash
# Register every host listed in ./hosts with Consul as a "node-exporter"
# service on port 9100, with an HTTP health check every 15s.
# Each line of hosts has the form: <host_name> <host_addr>
# The file is read relative to the current directory, so run this script
# from the directory that contains both files.

CONSUL_SERVER="X.X.X.X"

# The "|| [ -n ... ]" part keeps the last line when the file does not end
# with a newline.
while read -r host_name host_addr || [ -n "$host_name" ]
do
    # Skip blank or incomplete lines; otherwise a service with an empty
    # id/address would be registered.
    if [ -z "$host_name" ] || [ -z "$host_addr" ]; then
        continue
    fi

    payload='{"id": "'"$host_addr"'","name": "node-exporter","address": "'"$host_addr"'","port":9100,"tags": ["linux-node", "instance='"$host_name"'"],"checks": [{"http": "http://'"$host_addr"':9100/","interval": "15s"}]}'

    curl -X PUT -d "$payload" "http://$CONSUL_SERVER:8500/v1/agent/service/register"
done < hosts

-------------------------------------------------------------------------------------------

6、配置本地实体物理机

1)进入vm_config目录(注意,每台实体机监控都需要配置一个地址账号密码环境变量)

[root@prometheus-2 vm_config]# ls

vm2_config.env vm3_config.env vm_config.env

[root@prometheus-2 vm_config]# cat *

VSPHERE_USER=root

VSPHERE_PASSWORD=x.x.x.x

VSPHERE_HOST=x.x.x.x

VSPHERE_IGNORE_SSL=TRUE

VSPHERE_SPECS_SIZE=2000

----------------------------------------------------------------------------------------------------------

VSPHERE_USER=root

VSPHERE_PASSWORD=x.x.x.x

VSPHERE_HOST=x.x.x.x

VSPHERE_IGNORE_SSL=TRUE

VSPHERE_SPECS_SIZE=2000

----------------------------------------------------------------------------------------------------------

VSPHERE_USER=root

VSPHERE_PASSWORD=x.x.x.x

VSPHERE_HOST=x.x.x.x

VSPHERE_IGNORE_SSL=TRUE

VSPHERE_SPECS_SIZE=2000

----------------------------------------------------------------------------------------------------------

7、配置blackbox_exporter

1)拉取github项目 git clone https://github.com/prometheus/blackbox_exporter.git

2)修改blackbox.yml配置文件 vim blackbox.yml

# blackbox_exporter probe modules. The Prometheus "blackbox" job selects
# one of them through its "module" parameter.
modules:
  http_2xx:
    prober: http
    http:
      preferred_ip_protocol: "ip4"
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  grpc:
    prober: grpc
    grpc:
      tls: true
      preferred_ip_protocol: "ip4"
  grpc_plain:
    prober: grpc
    grpc:
      tls: false
      service: "service1"
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"
        - send: "SSH-2.0-blackbox-ssh-check"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        # ":<server prefix> 001" is the welcome reply; the prefix is any
        # run of non-space characters, hence [^ ]+ rather than [ ]+.
        - expect: ":[^ ]+ 001"
  icmp:
    prober: icmp
  icmp_ttl5:
    prober: icmp
    timeout: 5s
    icmp:
      ttl: 5

8、配置snmp_exporter

1)拉取github项目 git clone https://github.com/prometheus/snmp_exporter.git

2) 确保系统有GO环境、使用生成器生成配置信息

cd snmp_exporter/generator

make generator mibs

make generate

3)在mibs文件夹中添加被监控机器的mib文件信息

4)配置generator.yml文件

# snmp_exporter generator input: defines the "sangfor" module, i.e. the
# objects to walk on the device and how some of them are decoded.
# NOTE(review): version/auth are set per module here; confirm that the
# generator release in use still accepts that layout.
modules:
  sangfor:
    walk:
      - sfSysDevName          # system host name
      - adStandByState        # active/standby state of the HA pair
      - sfCpuLoadLast1Min     # CPU load average over the last 1 minute
      - sfCpuLoadLast5Min     # CPU load average over the last 5 minutes
      - sfCpuLoadLast15Min    # CPU load average over the last 15 minutes
      - sfCpuTemp             # CPU temperature
      - sfSysTotalMemory      # total memory (KB)
      - sfSysFreeMemory       # free memory (KB)
      - sfDiskSize            # disk size (M)
      - sfFilesystemName      # disk partition name
      - sfDiskUsed            # used disk space (M)
      - sfDiskAvail           # free disk space (M)
      - sfDiskUsedPercent     # disk usage (%)
      - sfDeviceStatus        # disk status
      - sfFanName             # fan name
      - sfFanSpeed            # fan speed
      - sfFanState            # fan state
      - sfPowerState          # power supply state
      - adConns               # system concurrent connections
      - adNewConns            # system new connections
      - adVsConns             # concurrent connections of all virtual services
      - adVsNewConns          # new connections of all virtual services
      - adUplinkThroughput    # uplink traffic of all links (integer)
      - adDownlinkThroughput  # downlink traffic of all links (integer)
      - adMemCostRate         # memory usage
      - adHttpRequest         # current HTTP request rate of the device
      - adVsNumber            # number of virtual services
      - adPoolNumber          # number of node pools
      - adNodeNumber          # number of nodes
      - adLinkName            # link name
      - adLinkType            # link type
      - adLinkIfName          # network interface used by the link
      - adLinkStatus          # link status: 0 = offline, 1 = normal, 2 = busy
      - adLinkBitIn           # link uplink traffic
      - adLinkBitOut          # link downlink traffic
      - adLinkNumber          # number of links on the device
      - adCpuCostRate         # CPU usage
      - adUptime              # system uptime
      - adInterfaceName       # interface name
      - adInterfaceBitIn      # interface uplink data

    max_repetitions: 25
    retries: 3
    timeout: 5s
    version: 2  # SNMP v2
    auth:
      community: public  # community string of the device

    lookups: []

    # Decode these objects as text.
    overrides:
      sfSysDevName:
        type: DisplayString

      sfSysCpuCostRate:
        type: DisplayString

      sfCpuLoadLast1Min:
        type: DisplayString

      sfCpuTemp:
        type: DisplayString

      adStandByState:
        type: DisplayString

      adLinkName:
        type: DisplayString

5)使用配置器生成snmp.yml配置文件 (用docker生成)

docker run -it -v "${PWD}:/opt/" prom/snmp-generator:master generate

9、配置nginx代理

1)进入nginx文件夹修改配置信息

目录结构[root@prometheus-2 nginx]# tree

├── conf.d

│   └── default.conf #nginx.config配置文件

├── html

│   └── index.html #访问展示页面

└── image #存放image图像

├── alertmanager.png

├── consul.png

├── grafana.jpg

├── x.x.x.x.png

└── prometheus.png

vim nginx/conf.d/default.conf

# Landing-page virtual host: serves the static index page from
# /usr/share/nginx/html and the logo images from /image.
server {
    listen       80;
    listen  [::]:80;
    server_name  localhost;

    #access_log  /var/log/nginx/host.access.log  main;

    location / {
        root   /usr/share/nginx/html;
        index  index.html index.htm;
    }

    #error_page  404              /404.html;

    # redirect server error pages to the static page /50x.html
    #
    error_page   500 502 503 504  /50x.html;
    location = /50x.html {
        root   /usr/share/nginx/html;
    }

    # Add a new location for image storage, so files under /image
    # (mounted from ./nginx/image) can be served.
    location /image/ {
        alias /image/;
    }

    # proxy the PHP scripts to Apache listening on 127.0.0.1:80
    #
    #location ~ \.php$ {
    #    proxy_pass   http://127.0.0.1;
    #}

    # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000
    #
    #location ~ \.php$ {
    #    root           html;
    #    fastcgi_pass   127.0.0.1:9000;
    #    fastcgi_index  index.php;
    #    fastcgi_param  SCRIPT_FILENAME  /scripts$fastcgi_script_name;
    #    include        fastcgi_params;
    #}

    # deny access to .htaccess files, if Apache's document root
    # concurs with nginx's one
    #
    #location ~ /\.ht {
    #    deny  all;
    #}
}

2)配置访问展示页面

  1. [root@prometheus-2 html]# cat index.html
  2. Monitoring
  3. prometheus
  4. Prometheus Monitoring
  5. Click to Enter
  • grafana
  • Grafana Dashboard
  • Click to Enter
  • alertmanager
  • Alertmanager Notifications
  • Click to Enter
  • consul
  • Consul Service Discovery
  • Click to Enter
  • grafana
  • IOA-Grafana
  • Click to Enter
  • 10、可用性验证

    1)访问nginx代理域名

    2)prometheus展示

    3)grafana展示

    4)alertmanager展示

    5)consul展示

    来源地址:https://blog.csdn.net/2303_77150012/article/details/130337807

    阅读原文内容投诉

    免责声明:

    ① 本站未注明“稿件来源”的信息均来自网络整理。其文字、图片和音视频稿件的所属权归原作者所有。本站收集整理出于非商业性的教育和科研之目的,并不意味着本站赞同其观点或证实其内容的真实性。仅作为临时的测试数据,供内部测试之用。本站并未授权任何人以任何方式主动获取本站任何信息。

    ② 本站未注明“稿件来源”的临时测试数据将在测试完成后最终做删除处理。有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341

    软考中级精品资料免费领

    • 2024年上半年信息系统项目管理师第二批次真题及答案解析(完整版)

      难度     807人已做
      查看
    • 【考后总结】2024年5月26日信息系统项目管理师第2批次考情分析

      难度     351人已做
      查看
    • 【考后总结】2024年5月25日信息系统项目管理师第1批次考情分析

      难度     314人已做
      查看
    • 2024年上半年软考高项第一、二批次真题考点汇总(完整版)

      难度     433人已做
      查看
    • 2024年上半年系统架构设计师考试综合知识真题

      难度     221人已做
      查看

    相关文章

    发现更多好内容

    猜你喜欢

    AI推送时光机
    位置:首页-资讯-服务器
    咦!没有更多了?去看看其它编程学习网 内容吧
    首页课程
    资料下载
    问答资讯