| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: InsufficientMembers
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
for: 3m
labels:
severity: critical
annotations:
description: If one more etcd member goes down the cluster will be unavailable
summary: etcd cluster insufficient members
|
ok
|
|
20.44s ago
|
331.9us |
| alert: NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
for: 1m
labels:
severity: critical
annotations:
description: etcd member {{ $labels.instance }} has no leader
summary: etcd member has no leader
|
ok
|
|
20.44s ago
|
51.4us |
| alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
|
ok
|
|
20.44s ago
|
80.7us |
| alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, sum by(grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job="etcd"}[5m]))) > 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
summary: slow gRPC requests
|
ok
|
|
20.44s ago
|
124.5us |
| alert: HighNumberOfFailedHTTPRequests
expr: sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
|
ok
|
|
20.44s ago
|
116.7us |
| alert: HighNumberOfFailedHTTPRequests
expr: sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
|
ok
|
|
20.44s ago
|
113.1us |
| alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
summary: slow HTTP requests
|
ok
|
|
20.44s ago
|
107.7us |
| alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
summary: etcd member communication is slow
|
ok
|
|
20.44s ago
|
65.6us |
| alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
|
ok
|
|
20.44s ago
|
64.9us |
| alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} fsync durations are high
summary: high fsync durations
|
ok
|
|
20.44s ago
|
1.241ms |
| alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} commit durations are high
summary: high commit durations
|
ok
|
|
20.439s ago
|
1.402ms |
|
53.62s ago |
2.62ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: K8SNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
annotations:
description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
summary: Node status is NotReady
|
ok
|
|
44.744s ago
|
361.7us |
| alert: K8SManyNodesNotReady
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }}% of Kubernetes nodes are not ready'
|
ok
|
|
44.744s ago
|
396.9us |
| alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
for: 1h
labels:
severity: warning
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
|
ok
|
|
44.744s ago
|
254.3us |
| alert: K8SKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) * 100 > 10
for: 1h
labels:
severity: critical
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
|
ok
|
|
44.743s ago
|
331.1us |
| alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
for: 10m
labels:
severity: warning
annotations:
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
summary: Kubelet is close to pod limit
|
ok
|
|
44.743s ago
|
57.3us |
|
36.993s ago |
165.4ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: pod_name:container_memory_usage_bytes:sum
expr: sum by(pod_name) (container_memory_usage_bytes{container_name!="POD",pod_name!=""})
|
ok
|
|
36.993s ago
|
21.56ms |
| record: pod_name:container_spec_cpu_shares:sum
expr: sum by(pod_name) (container_spec_cpu_shares{container_name!="POD",pod_name!=""})
|
ok
|
|
36.971s ago
|
10.63ms |
| record: pod_name:container_cpu_usage:sum
expr: sum by(pod_name) (rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
|
ok
|
|
36.961s ago
|
28.68ms |
| record: pod_name:container_fs_usage_bytes:sum
expr: sum by(pod_name) (container_fs_usage_bytes{container_name!="POD",pod_name!=""})
|
ok
|
|
36.932s ago
|
437us |
| record: namespace:container_memory_usage_bytes:sum
expr: sum by(namespace) (container_memory_usage_bytes{container_name!=""})
|
ok
|
|
36.932s ago
|
6.805ms |
| record: namespace:container_spec_cpu_shares:sum
expr: sum by(namespace) (container_spec_cpu_shares{container_name!=""})
|
ok
|
|
36.925s ago
|
3.04ms |
| record: namespace:container_cpu_usage:sum
expr: sum by(namespace) (rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
|
ok
|
|
36.922s ago
|
30.68ms |
| record: cluster:memory_usage:ratio
expr: sum by(cluster) (container_memory_usage_bytes{container_name!="POD",pod_name!=""}) / sum by(cluster) (machine_memory_bytes)
|
ok
|
|
36.891s ago
|
18.19ms |
| record: cluster:container_spec_cpu_shares:ratio
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 / sum(machine_cpu_cores)
|
ok
|
|
36.873s ago
|
7.437ms |
| record: cluster:container_cpu_usage:ratio
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) / sum(machine_cpu_cores)
|
ok
|
|
36.867s ago
|
36.77ms |
| record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
labels:
quantile: "0.99"
|
ok
|
|
36.83s ago
|
172.7us |
| record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
labels:
quantile: "0.9"
|
ok
|
|
36.83s ago
|
63.3us |
| record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
labels:
quantile: "0.5"
|
ok
|
|
36.83s ago
|
54.7us |
| alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
for: 10m
labels:
severity: warning
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
|
ok
|
|
36.83s ago
|
130.6us |
| alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
for: 10m
labels:
severity: critical
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
|
ok
|
|
36.83s ago
|
88.8us |
| alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 2
for: 10m
labels:
severity: warning
annotations:
description: API server returns errors for {{ $value }}% of requests
|
ok
|
|
36.83s ago
|
122.8us |
| alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 5
for: 10m
labels:
severity: critical
annotations:
description: API server returns errors for {{ $value }}% of requests
|
ok
|
|
36.83s ago
|
98.3us |
| alert: K8SApiserverDown
expr: absent(up{job="apiserver"} == 1)
for: 20m
labels:
severity: critical
annotations:
description: No API servers are reachable or all have disappeared from service discovery
|
ok
|
|
36.83s ago
|
66.8us |
| alert: K8sCertificateExpirationNotice
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
labels:
severity: warning
annotations:
description: Kubernetes API Certificate is expiring soon (less than 7 days)
|
ok
|
|
36.83s ago
|
122.8us |
| alert: K8sCertificateExpirationNotice
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
labels:
severity: critical
annotations:
description: Kubernetes API Certificate is expiring in less than 1 day
|
ok
|
|
36.83s ago
|
103.1us |
|
59.211s ago |
1.543ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| record: instance:node_cpu:rate:sum
expr: sum by(instance) (rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
|
ok
|
|
59.212s ago
|
350.2us |
| record: instance:node_filesystem_usage:sum
expr: sum by(instance) ((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
|
ok
|
|
59.211s ago
|
133.1us |
| record: instance:node_network_receive_bytes:rate:sum
expr: sum by(instance) (rate(node_network_receive_bytes[3m]))
|
ok
|
|
59.211s ago
|
84.1us |
| record: instance:node_network_transmit_bytes:rate:sum
expr: sum by(instance) (rate(node_network_transmit_bytes[3m]))
|
ok
|
|
59.211s ago
|
74.9us |
| record: instance:node_cpu:ratio
expr: sum without(cpu, mode) (rate(node_cpu{mode!="idle"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu))
|
ok
|
|
59.211s ago
|
161.5us |
| record: cluster:node_cpu:sum_rate5m
expr: sum(rate(node_cpu{mode!="idle"}[5m]))
|
ok
|
|
59.211s ago
|
83.2us |
| record: cluster:node_cpu:ratio
expr: cluster:node_cpu:sum_rate5m / count(sum by(instance, cpu) (node_cpu))
|
ok
|
|
59.211s ago
|
94.1us |
| alert: NodeExporterDown
expr: absent(up{job="node-exporter"} == 1)
for: 10m
labels:
severity: warning
annotations:
description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery
|
ok
|
|
59.211s ago
|
199.9us |
| alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
for: 30m
labels:
severity: warning
annotations:
description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})
|
ok
|
|
59.211s ago
|
174.5us |
| alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
for: 10m
labels:
severity: critical
annotations:
description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})
|
ok
|
|
59.211s ago
|
92.4us |
| alert: InactiveRAIDDisk
expr: node_md_disks - node_md_disks_active > 0
for: 10m
labels:
severity: warning
annotations:
description: '{{$value}} RAID disk(s) on node {{$labels.instance}} are inactive'
|
ok
|
|
59.211s ago
|
62.5us |
|
14.883s ago |
1.382ms |
| Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
| alert: PrometheusConfigReloadFailed
expr: prometheus_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
|
ok
|
|
14.883s ago
|
211.2us |
| alert: PrometheusNotificationQueueRunningFull
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
for: 10m
labels:
severity: warning
annotations:
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}
|
ok
|
|
14.883s ago
|
156.7us |
| alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
|
ok
|
|
14.883s ago
|
83.5us |
| alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03
for: 10m
labels:
severity: critical
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
|
ok
|
|
14.883s ago
|
69.4us |
| alert: PrometheusNotConnectedToAlertmanagers
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers
|
ok
|
|
14.883s ago
|
321us |
| alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
for: 12h
labels:
severity: warning
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.'
summary: Prometheus has issues reloading data blocks from disk
|
ok
|
|
14.883s ago
|
231.8us |
| alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
for: 12h
labels:
severity: warning
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.'
summary: Prometheus has issues compacting sample blocks
|
ok
|
|
14.883s ago
|
169.9us |
| alert: PrometheusTSDBWALCorruptions
expr: tsdb_wal_corruptions_total > 0
for: 4h
labels:
severity: warning
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
summary: Prometheus write-ahead log is corrupted
|
ok
|
|
14.882s ago
|
41us |
| alert: PrometheusNotIngestingSamples
expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.
summary: Prometheus isn't ingesting samples
|
ok
|
|
14.883s ago
|
78.5us |