Reference:
1、GitHub上Prometheus文档 https://prometheus-operator.dev/docs/prologue/quick-start/
2、公司内部 https://gitlab.cd.xsio.cn/devops/prometheus-operator
3、钉钉告警参考 https://www.cnblogs.com/weifeng1463/p/16070899.html
注意: 不需要执行 https://gitlab.cd.xsio.cn/devops/prometheus-operator 的 1-4步。
执行 wget https://github.com/prometheus-operator/kube-prometheus/archive/main.zip
执行 unzip main.zip
执行 ll kube-prometheus-main/
#Create the namespace and CRDs, and then wait for them to be available before creating the remaining resources
执行 kubectl create -f manifests/setup
如果客户环境只能连接国内网络,有下面两个镜像是拉取不到的,需要更改两个文件里的镜像内容
执行 vi kube-prometheus-main/manifests/prometheusAdapter-deployment.yaml
将
image: k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.1
改为
image: registry.cn-hangzhou.aliyuncs.com/clab-docker/prometheus-operator:prometheus-adapterv91
执行 vi kube-prometheus-main/manifests/kubeStateMetrics-deployment.yaml
将
image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.5.0
改为
image: registry.cn-hangzhou.aliyuncs.com/clab-docker/kube-state-metrics:v242
如果Grafana web放在域名子路径下如"https://app.xxx.com/grafana",需修改Grafana环境变量
vi manifests/grafana-deployment.yaml
spec:
containers:
- env:
- name: GF_SERVER_SERVE_FROM_SUB_PATH
value: "true"
- name: GF_SERVER_ROOT_URL
value: "%(protocol)s://%(domain)s/grafana"
执行 kubectl create -f manifests/
执行 kubectl get pod -nmonitoring
NAME READY STATUS RESTARTS AGE
alertmanager-main-0 2/2 Running 0 22d
blackbox-exporter-6798fb5bb4-8qtvc 3/3 Running 0 22d
grafana-78d8cfccff-58hhf 1/1 Running 0 22d
kube-state-metrics-7fd969fc67-qp42w 4/4 Running 0 22d
node-exporter-j5znt 2/2 Running 0 22d
node-exporter-kqrgf 2/2 Running 0 22d
node-exporter-s8ffc 2/2 Running 0 22d
node-exporter-sv7kz 2/2 Running 0 22d
prometheus-adapter-7766d4f756-sqfqq 1/1 Running 0 22d
prometheus-adapter-7766d4f756-v8p4b 1/1 Running 0 22d
prometheus-k8s-0 2/2 Running 0 22d
prometheus-k8s-1 2/2 Running 0 22d
prometheus-operator-7ddc6877d5-cjlvx 2/2 Running 0 22d
测试
wget -O - grafana.monitoring:3000
curl grafana.monitoring:3000
注意:阿里云环境如果客户启用了阿里云k8s的prometheus node-exporter,可能会碰到hostport端口冲突的问题,node-exporter无法运行,可以修改 manifests/nodeExporter-service.yaml 的端口。或使用阿里云自带的node监控即可。
下载公司内部配置包,更改tls-cipher和镜像
其中tls-cipher和kube-rbac-proxy的值是在kube-prometheus-main/manifests/prometheusOperator-deployment.yaml中获取的
执行 vi prometheus-operator-master/8-custom-exporter/kafka-exporter/kafka-exporter-deployment.yaml
...
...
#- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:9308/
env:
- name: IP
valueFrom:
fieldRef:
fieldPath: status.podIP
#image: registry.cn-hangzhou.aliyuncs.com/clab-docker/kube-rbac-proxy:v0.4.1
image: quay.io/brancz/kube-rbac-proxy:v0.12.0
...
...
用 https://gitlab.cd.xsio.cn/devops/prometheus-operator 的 8-custom-exporter 部署 kafka-exporter
vi 8-custom-exporter/kafka-exporter/kafka-exporter-deployment.yaml
# 编辑kafka地址,如果只有一个节点,请删除多余行
kubectl apply -f 8-custom-exporter/kafka-exporter
kubectl -nmonitoring get pod -l app=kafka-exporter # 确认状态为Running
部署完登录上grafana之后,需要导入"prometheus-operator\10-grafana-dashboard\Kafka-1585242659117.json"
测试
curl https://kafka-exporter.monitoring:9308/metrics
curl http://kafka-exporter.monitoring:9308/metrics
#Unauthorized
klc kafka-exporter kafka-exporter
vi 8-custom-exporter/yarn-exporter/yarn-exporter-deployment.yaml
# 修改--resource-manager.address后的yarn resource manager地址,一般为8088端口
kubectl apply -f 8-custom-exporter/yarn-exporter
kubectl -nmonitoring get pod -l app=yarn-exporter # 确认状态为Running
测试
curl emr-header-1:8088 -L | more
curl emr-header-2:8088 -L | more
curl http://yarn-exporter.monitoring:9877/metrics
#Unauthorized
klc yarn-exporter yarn-exporter
操作方法同上
执行 kubectl edit secret alertmanager-main -n monitoring
将 data 下的 alertmanager.yaml 字段改为如下内容(值为 base64 编码后的 Alertmanager 配置):
alertmanager.yaml:Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0KcmVjZWl2ZXJzOgotIG5hbWU6IGRpbmd0YWxrCiAgd2ViaG9va19jb25maWdzOgogICAgLSB1cmw6IGh0dHA6Ly9wcm9tZXRoZXVzLXdlYmhvb2stZGluZ3RhbGs6ODA2MC9kaW5ndGFsay93ZWJob29rMS9zZW5kCiAgICAgIHNlbmRfcmVzb2x2ZWQ6IHRydWUKCnJvdXRlOgogIGdyb3VwX2J5OgogICAgLSBhbGVydG5hbWUKICBncm91cF93YWl0OiAiMzBtIgogIGdyb3VwX2ludGVydmFsOiAiNjBtIgogIHJlcGVhdF9pbnRlcnZhbDogIjI0aCIKICByZWNlaXZlcjogImRpbmd0YWxrIgogIHJvdXRlczoKICAgIC0gcmVjZWl2ZXI6ICdkaW5ndGFsaycKICAgICAgZ3JvdXBfd2FpdDogIjIwbSIKICAgICAgZ3JvdXBfaW50ZXJ2YWw6ICI0MG0iCiAgICAgIHJlcGVhdF9pbnRlcnZhbDogIjZoIgogICAgICBtYXRjaF9yZToKICAgICAgICBzZXZlcml0eTogbWVkaXVtfHdhcm5pbmcKICAgIC0gcmVjZWl2ZXI6ICJkaW5ndGFsayIKICAgICAgZ3JvdXBfd2FpdDogIjVtIgogICAgICBncm91cF9pbnRlcnZhbDogIjEwbSIKICAgICAgcmVwZWF0X2ludGVydmFsOiAiMWgiCiAgICAgIG1hdGNoX3JlOgogICAgICAgIHNldmVyaXR5OiBoaWdofGNyaXRpY2Fs
执行 kubectl rollout restart statefulset.apps/alertmanager-main -nmonitoring
执行 vi prometheus-webhook-dingtalk.yaml,注意将 --ding.profile=webhook1=https://... 中的 access_token 改为实际使用的钉钉机器人 token
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: prometheus-webhook-dingtalk
name: prometheus-webhook-dingtalk
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: prometheus-webhook-dingtalk
template:
metadata:
labels:
app: prometheus-webhook-dingtalk
spec:
containers:
- name: prometheus-webhook-dingtalk
image: timonwong/prometheus-webhook-dingtalk:v0.3.0
imagePullPolicy: IfNotPresent
args:
- --ding.profile=webhook1=https://oapi.dingtalk.com/robot/send?access_token=2ca158b715f95c29fa95ae30a251cb01d01b24b5497f110470d21b313689c0a2
- --template.file=/usr/share/prometheus-webhook-dingtalk/template/webhook-dingtalk.tmpl
volumeMounts:
- mountPath: /usr/share/prometheus-webhook-dingtalk/template/
name: webhook-dingtalk-template
ports:
- containerPort: 8060
protocol: TCP
resources:
requests:
cpu: 100m
memory: 100Mi
limits:
cpu: 200m
memory: 1000Mi
volumes:
- name: webhook-dingtalk-template
configMap:
name: webhook-dingtalk-template
defaultMode: 420
---
apiVersion: v1
kind: Service
metadata:
labels:
app: prometheus-webhook-dingtalk
name: prometheus-webhook-dingtalk
namespace: monitoring
spec:
ports:
- port: 8060
protocol: TCP
targetPort: 8060
selector:
app: prometheus-webhook-dingtalk
sessionAffinity: None
[root@ma-kafka test]# cat webhook-dingtalk.tmpl
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
{{ define "__text_alert_list" }}{{ range . }}
**报警内容**
{{ range .Labels.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**报警信息**
{{ range .Annotations.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
---
{{ end }}{{ end }}
{{ define "___text_alertresovle_list" }}{{ range . }}
---
**报警已恢复**
{{ range .Labels.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
---
{{ end }}{{ end }}
{{ define "ding.link.title" }}{{ template "__subject" . }}{{ end }}
{{ define "ding.link.content" }}
### k8s集群测试环境监控报警
---
{{ index .GroupLabels "alertname" }}
{{ if gt (len .Alerts.Firing) 0 -}}
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 -}}
{{ template "___text_alertresovle_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
执行 kubectl create configmap webhook-dingtalk-template --from-file=webhook-dingtalk.tmpl -n monitoring
执行 kubectl apply -f prometheus-webhook-dingtalk.yaml
部署好kube-prometheus之后,kubectl top nodes 可能无法获取节点指标数据,需将 v1beta1.metrics.k8s.io 这个 APIService 指回 kube-system 下的 metrics-server。
kubectl edit apiservice v1beta1.metrics.k8s.io
spec:
group: metrics.k8s.io
groupPriorityMinimum: 100
insecureSkipTLSVerify: true
service:
name: metrics-server
namespace: kube-system
或者用kubectl patch修改
kubectl patch apiservice v1beta1.metrics.k8s.io -p '{"spec":{"service":{"name":"metrics-server", "namespace":"kube-system"}}}'
检查
kubectl get apiservice v1beta1.metrics.k8s.io -oyaml