k8s-部署Prometheus(二)
//
# k8s-部署Prometheus(二)
# 部署prometheus服务
dockerhub镜像地址 (opens new window)
# Mirror the upstream Prometheus image into the private Harbor registry.
# The repository name must be "prometheus" so that it matches the image the
# Deployment pulls (harbor.yfklife.cn/devops/prometheus:v2.32.1).
docker pull prom/prometheus:v2.32.1
docker tag prom/prometheus:v2.32.1 harbor.yfklife.cn/devops/prometheus:v2.32.1
docker push harbor.yfklife.cn/devops/prometheus:v2.32.1
1
2
3
2
3
# 准备资源配置
# Create the manifest working directory when it is missing, then enter it.
# The cd only runs if the directory exists or was created successfully.
if [ ! -d /opt/application/prometheus/ ]; then
  mkdir -p /opt/application/prometheus/
fi && cd /opt/application/prometheus/
- 配置 rbac.yaml
vi prometheus-rbac.yaml
# ServiceAccount named "prometheus" in the devops namespace. It is the
# identity referenced by serviceAccountName in the Prometheus Deployment and
# by the ClusterRoleBinding subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: prometheus
  namespace: devops
---
# ClusterRole "prometheus": read-only access granted to the Prometheus
# ServiceAccount through the ClusterRoleBinding of the same name.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: prometheus
rules:
  # get/list/watch on core-group nodes, node metrics, services, endpoints, pods.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/metrics
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
  # get (only) on configmaps.
  - apiGroups:
      - ""
    resources:
      - configmaps
    verbs:
      - get
  # get on the non-resource /metrics URL.
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
# Binds the "prometheus" ClusterRole to the "prometheus" ServiceAccount in
# the devops namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: devops
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
- 配置 deployment.yaml
vi prometheus-deployment.yaml
# Prometheus server Deployment: one replica in the devops namespace, with
# configuration and TSDB data both read from the NFS-backed /data volume.
# The controller-managed "deployment.kubernetes.io/revision" annotation from
# the exported manifest is intentionally not declared here.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    name: prometheus
  name: prometheus
  namespace: devops
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 7
  selector:
    matchLabels:
      app: prometheus
  strategy:
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      # Pin the pod to one specific compute node.
      nodeName: hdss14-21.host.com
      containers:
        - name: prometheus
          image: harbor.yfklife.cn/devops/prometheus:v2.32.1
          imagePullPolicy: IfNotPresent
          command:
            - /bin/prometheus
          args:
            - --config.file=/data/etc/prometheus.yml
            - --storage.tsdb.path=/data/prometheus-tsdb
            # Author's note: keeps only about 10 minutes of data in memory;
            # raise it (e.g. 2h) for real production use.
            - --storage.tsdb.min-block-duration=10m
            # How long TSDB data is kept on disk; production may use more
            # (e.g. 168h). Uses the current flag name; the bare
            # --storage.tsdb.retention spelling is deprecated.
            - --storage.tsdb.retention.time=72h
            # Lets the configuration be reloaded later through an HTTP call.
            - --web.enable-lifecycle
          ports:
            - containerPort: 9090
              protocol: TCP
          volumeMounts:
            - mountPath: /data
              name: data
          resources:
            requests:
              cpu: "1000m"
              memory: "1.5Gi"
            limits:
              cpu: "2000m"
              memory: "8Gi"
      imagePullSecrets:
        - name: harbor
      securityContext:
        runAsUser: 0
      serviceAccountName: prometheus
      volumes:
        - name: data
          nfs:
            server: 192.168.14.200
            path: /data/k8s_share/prometheus
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
- 配置 service.yaml
vi prometheus-service.yaml
# Service exposing the Prometheus pods (label app=prometheus) on TCP 9090.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: devops
spec:
  ports:
    - port: 9090
      protocol: TCP
      targetPort: 9090
  selector:
    app: prometheus
1
2
3
4
5
6
7
8
9
10
11
12
2
3
4
5
6
7
8
9
10
11
12
- 配置 ingress.yaml
vi prometheus-ingress.yaml
# Ingress routing prometheus.yfklife.cn to the prometheus Service on 9090,
# handled by the traefik ingress class.
# NOTE(review): extensions/v1beta1 Ingress is no longer served as of
# Kubernetes 1.22. On such clusters this object must be rewritten for
# networking.k8s.io/v1 (pathType, backend.service.name / port.number).
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  annotations:
    kubernetes.io/ingress.class: traefik
  name: prometheus
  namespace: devops
spec:
  rules:
    - host: prometheus.yfklife.cn
      http:
        paths:
          - path: /
            backend:
              serviceName: prometheus
              servicePort: 9090
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
- 创建资源,添加域名解析
# Apply the four Prometheus manifests one after another, in the listed order.
for part in rbac deployment ingress service; do
  kubectl apply -f "prometheus-${part}.yaml"
done
1
2
3
4
2
3
4
- 访问Prometheus Dashboard
# 配置prometheus.yml
- k8s监控值参考配置
http://xxxxxxxxx/plugins/grafana-kubernetes-app/page/cluster-config (请将 xxxxxxxxx 改成你的grafana地址)
- prometheus.yml
#[root@hdss14-200 etc]# head -20 prometheus.yml
# Head of prometheus.yml: global timings, Alertmanager target, rule files.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  scrape_timeout: 10s
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager"]
rule_files:
  - "/data/etc/rules.yml"
# Kubernetes monitoring (scrape) configuration omitted here.
1
2
3
4
5
6
7
8
9
10
11
12
2
3
4
5
6
7
8
9
10
11
12
- rules.yml
#[root@hdss14-200 etc]# cat rules.yml
# Alerting rules loaded by Prometheus from /data/etc/rules.yml.
groups:
  - name: hostStatsAlert
    rules:
      # Fires when a container's memory usage stays above 80% of its
      # (non-zero) memory limit for 5 minutes.
      - alert: PodMemUsagePercent
        expr: ceil(container_memory_usage_bytes{container_label_io_kubernetes_docker_type="container"} / (container_spec_memory_limit_bytes != 0) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "NameSpace: {{ $labels.container_label_io_kubernetes_pod_namespace}} , PodName: {{ $labels.container_label_io_kubernetes_pod_name }} mem usage percent has exceeded 80% (current value: {{ $value }}%)"
1
2
3
4
5
6
7
8
9
10
11
2
3
4
5
6
7
8
9
10
11
# 收集应用资源-Annotations
Annotation是用户任意定义的“附加”信息,以便客户端(如工具和库)可以检索此metadata.
Annotations就是注解,Annotation与Label类似,也使用key/value键值对的形式进行定义.
prometheus通过识别POD里面的Annotations注解,去做一个监控匹配,层次【spec->template->metadata,添加annotations注释】
- traefik
# Pod-template annotations (spec -> template -> metadata) for traefik.
annotations:
  prometheus_io_scheme: "traefik"
  prometheus_io_path: "/metrics"
  prometheus_io_port: "8080"
1
2
3
4
5
2
3
4
5
- Spinnaker提供者-TCP
# Pod-template annotations (spec -> template -> metadata) for the TCP
# provider example: blackbox_* describe the probe, prometheus_io_* the scrape.
annotations:
  blackbox_port: "20880"
  blackbox_scheme: "tcp"
  prometheus_io_scrape: "true"
  prometheus_io_port: "12346"
  prometheus_io_path: "/"
1
2
3
4
5
6
2
3
4
5
6
- Spinnaker消费者-http
# Pod-template annotations (spec -> template -> metadata) for the HTTP
# consumer example: blackbox_* describe the probe, prometheus_io_* the scrape.
annotations:
  blackbox_path: "/hello?name=health"
  blackbox_port: "8080"
  blackbox_scheme: "http"
  prometheus_io_scrape: "true"
  prometheus_io_port: "12346"
  prometheus_io_path: "/"
1
2
3
4
5
6
7
2
3
4
5
6
7
官方annotation说明 (opens new window)
# 部署alertmanager服务
dockerhub镜像地址 (opens new window)
# Mirror the upstream Alertmanager image into the private Harbor registry.
# Runs in a subshell so the helper variables do not leak into the session.
(
  upstream=prom/alertmanager:v0.22.0
  mirror=harbor.yfklife.cn/devops/alertmanager:v0.22.0
  docker pull "$upstream"
  docker tag "$upstream" "$mirror"
  docker push "$mirror"
)
1
2
3
4
2
3
4
# 准备资源配置
- 配置 configmap.yaml
vi alertmanager-configmap.yaml
# Alertmanager configuration (config.yml) and e-mail HTML template
# (email.tmpl). The Alertmanager Deployment mounts this ConfigMap at
# /etc/alertmanager, which is where --config.file and the templates glob
# point.
# NOTE(review): smtp_auth_password is kept in plain text here; confirm this
# is a placeholder, or move the credential out of version-controlled files.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: devops
data:
  config.yml: |
    global:
      resolve_timeout: 5m
      smtp_from: 'yfk@163.com'
      smtp_smarthost: 'smtp.163.com:25'
      #smtp_smarthost: 'smtp.qq.com:465'
      smtp_auth_username: 'yfk@163.com'
      smtp_auth_password: 'SHYLIRZZ'
      smtp_require_tls: false
      #smtp_hello: 'yfk@163.com'
    templates:
      - '/etc/alertmanager/*.tmpl'
    route:
      group_by: ['alertname', 'cluster']
      group_wait: 20s
      group_interval: 20s
      repeat_interval: 12h
      receiver: 'email'
    receivers:
      - name: 'email'
        email_configs:
          - to: '122xxxxxx@qq.com'
            send_resolved: true
            html: '{{ template "email.to.html" . }}'
            headers: { Subject: " {{ .CommonLabels.instance }} {{ .CommonLabels.alertname }}" }
            #headers: { Subject: " {{ .CommonLabels.instance }} {{ .CommonAnnotations.summary }}" }
  # Each section iterates only over its own subset (.Alerts.Firing /
  # .Alerts.Resolved); ranging over .Alerts would list every alert in both
  # sections when a notification carries firing and resolved alerts together.
  email.tmpl: |
    {{ define "email.to.html" }}
    {{- if gt (len .Alerts.Firing) 0 -}}
    {{ range .Alerts.Firing }}
    问题告警程序: prometheus_alert <br>
    告警主题: {{ .Annotations.summary }} <br>
    告警级别: {{ .Labels.severity }} <br>
    告警类型: {{ .Labels.alertname }} <br>
    故障主机: {{ .Labels.instance }} <br>
    触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
    {{ end }}{{ end -}}
    {{- if gt (len .Alerts.Resolved) 0 -}}
    {{ range .Alerts.Resolved }}
    恢复告警程序: prometheus_alert <br>
    告警主题: {{ .Annotations.summary }} <br>
    告警级别: {{ .Labels.severity }} <br>
    告警类型: {{ .Labels.alertname }} <br>
    故障主机: {{ .Labels.instance }} <br>
    触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
    恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }} <br>
    {{ end }}{{ end -}}
    {{- end }}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
- 配置 deployment.yaml
vi alertmanager-deployment.yaml
# Alertmanager Deployment: one replica, configuration mounted from the
# alertmanager-config ConfigMap at /etc/alertmanager.
# Uses apps/v1 like the other Deployments in this guide; the
# extensions/v1beta1 Deployment API is not served from Kubernetes 1.16 on.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: devops
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
        - name: alertmanager
          image: harbor.yfklife.cn/devops/alertmanager:v0.22.0
          args:
            - "--config.file=/etc/alertmanager/config.yml"
            - "--cluster.advertise-address=0.0.0.0:9093"
            - "--storage.path=/alertmanager"
          ports:
            - name: alertmanager
              containerPort: 9093
          volumeMounts:
            - name: alertmanager-cm
              mountPath: /etc/alertmanager
      volumes:
        - name: alertmanager-cm
          configMap:
            name: alertmanager-config
      imagePullSecrets:
        - name: harbor
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
- 配置 service.yaml
vi alertmanager-service.yaml
# Service for the Alertmanager pods (label app=alertmanager): port 80 on the
# Service forwards to container port 9093.
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: devops
spec:
  selector:
    app: alertmanager
  ports:
    - port: 80
      targetPort: 9093
1
2
3
4
5
6
7
8
9
10
11
2
3
4
5
6
7
8
9
10
11
- 创建资源
# Apply the three Alertmanager manifests one after another, in the listed order.
for part in configmap deployment service; do
  kubectl apply -f "alertmanager-${part}.yaml"
done
1
2
3
2
3
# 刷新Prometheus配置
# Ask Prometheus to reload its configuration through the ingress host.
# Relies on the --web.enable-lifecycle flag set in the Prometheus Deployment.
curl -X POST http://prometheus.yfklife.cn/-/reload
# 测试邮件告警
- 创建一个deployment
[root@hdss14-21 prometheus]# cat test.yaml
# Throw-away Deployment used to trigger the memory alert: an Ubuntu container
# kept alive by "tail -f", with a 256Mi memory limit.
# No namespace is set, so it lands in the namespace kubectl is pointed at.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-deployment
spec:
  selector:
    matchLabels:
      app: test
  replicas: 1
  template:
    metadata:
      labels:
        app: test
    spec:
      containers:
        - name: test
          image: harbor.yfklife.cn/public/ubuntu:20.04
          #image: ubuntu:20.04
          command: ["tail"]
          args: ["-f","/var/log/alternatives.log"]
          ports:
            - containerPort: 80
          resources:
            limits:
              cpu: 200m
              memory: 256Mi
            requests:
              cpu: 100m
              memory: 50Mi
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
- pod里安装命令 "stress",测试
# Open a shell in the test pod. The pod name is looked up in the same
# namespace the exec targets, and "--" separates kubectl flags from the
# command to run inside the container.
kubectl exec -it -n default "$(kubectl get pod -n default | grep test-deployment | awk '{print $1}')" -- bash
# Inside the pod: refresh the package index before installing, since the
# official ubuntu images ship without package lists.
apt update && apt install stress -y
# Given the memory limit above, a 240M load (4 workers x 60M) triggers the alert.
stress --vm 4 --vm-bytes 60M --vm-keep
1
2
3
2
3
- 等待邮件消息
-----------告警信息-------------
如果对你有帮助,给博主买杯咖啡吧
//
如果此文章对您有帮助,点击 -->> 请博主喝咖啡 (opens new window)
上次更新: 2022/06/12, 18:36:37