Each rule definition below is followed by its state, last evaluation, and evaluation time; none of the rules reported an error.

alert: InsufficientMembers
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
for: 3m
labels:
  severity: critical
annotations:
  description: If one more etcd member goes down the cluster will be unavailable
  summary: etcd cluster insufficient members
State: ok | Last evaluation: 5.582s ago | Evaluation time: 358.4µs

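For example, with a three-member cluster (all three scraped under job="etcd"), count(up{job="etcd"}) / 2 - 1 evaluates to 0.5, so the alert fires as soon as a single member is down, i.e. the moment the cluster is one further failure away from losing quorum.
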
alert: NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
for: 1m
labels:
  severity: critical
annotations:
  description: etcd member {{ $labels.instance }} has no leader
  summary: etcd member has no leader
State: ok | Last evaluation: 5.582s ago | Evaluation time: 67.6µs

alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
labels:
  severity: warning
annotations:
  description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
  summary: a high number of leader changes within the etcd cluster are happening
State: ok | Last evaluation: 5.582s ago | Evaluation time: 106.4µs

alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, sum by(grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job="etcd"}[5m]))) > 0.15
for: 10m
labels:
  severity: critical
annotations:
  description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
  summary: slow gRPC requests
State: ok | Last evaluation: 5.582s ago | Evaluation time: 127.8µs

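Here histogram_quantile(0.99, ...) estimates the 99th percentile from the per-bucket request rates, so this alert fires when, for some unary gRPC method, roughly one percent of calls have been taking longer than 150 ms over the 5-minute window, sustained for 10 minutes.
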
alert: HighNumberOfFailedHTTPRequests
expr: sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
for: 10m
labels:
  severity: warning
annotations:
  description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
  summary: a high number of HTTP requests are failing
State: ok | Last evaluation: 5.582s ago | Evaluation time: 118.7µs

alert: HighNumberOfFailedHTTPRequests
expr: sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
for: 5m
labels:
  severity: critical
annotations:
  description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
  summary: a high number of HTTP requests are failing
State: ok | Last evaluation: 5.582s ago | Evaluation time: 114.1µs

alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
  severity: warning
annotations:
  description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
  summary: slow HTTP requests
State: ok | Last evaluation: 5.582s ago | Evaluation time: 100.5µs

alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
for: 10m
labels:
  severity: warning
annotations:
  description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
  summary: etcd member communication is slow
State: ok | Last evaluation: 5.582s ago | Evaluation time: 72.9µs

alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
  severity: warning
annotations:
  description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
  summary: a high number of proposals within the etcd cluster are failing
State: ok | Last evaluation: 5.582s ago | Evaluation time: 62.4µs

alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
for: 10m
labels:
  severity: warning
annotations:
  description: etcd instance {{ $labels.instance }} fsync durations are high
  summary: high fsync durations
State: ok | Last evaluation: 5.582s ago | Evaluation time: 1.686ms

alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
for: 10m
labels:
  severity: warning
annotations:
  description: etcd instance {{ $labels.instance }} commit durations are high
  summary: high commit durations
State: ok | Last evaluation: 5.581s ago | Evaluation time: 1.807ms

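These are ordinary Prometheus alerting rules, so the same definitions can live in a standalone rule file loaded through rule_files in prometheus.yml. A minimal sketch, assuming a file name and group name of my own choosing (in a kube-prometheus setup the Operator generates and mounts equivalent files):

```yaml
# etcd.rules.yaml -- hypothetical file name; loaded via rule_files in prometheus.yml
groups:
  - name: etcd            # group name is an assumption, not taken from the page above
    rules:
      - alert: InsufficientMembers
        expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
        for: 3m
        labels:
          severity: critical
        annotations:
          description: If one more etcd member goes down the cluster will be unavailable
          summary: etcd cluster insufficient members
      # ...the remaining etcd rules follow the same pattern
```
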
Next rule group, last evaluated 38.762s ago (evaluation took 3.347ms):

alert: K8SNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1h
labels:
  severity: warning
annotations:
  description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
  summary: Node status is NotReady
State: ok | Last evaluation: 29.886s ago | Evaluation time: 286.4µs

alert: K8SManyNodesNotReady
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
for: 1m
labels:
  severity: critical
annotations:
  description: '{{ $value }}% of Kubernetes nodes are not ready'
State: ok | Last evaluation: 29.886s ago | Evaluation time: 484.8µs

alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
for: 1h
labels:
  severity: warning
annotations:
  description: Prometheus failed to scrape {{ $value }}% of kubelets.
State: ok | Last evaluation: 29.885s ago | Evaluation time: 277.9µs

alert: K8SKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) * 100 > 10
for: 1h
labels:
  severity: critical
annotations:
  description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
  summary: Many Kubelets cannot be scraped
State: ok | Last evaluation: 29.885s ago | Evaluation time: 589µs

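The absent(up{job="kubelet"} == 1) term returns 1 only when no kubelet target is being scraped successfully at all, so this critical variant also fires when the kubelets have vanished from service discovery rather than merely failing their scrapes.
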
alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
for: 10m
labels:
  severity: warning
annotations:
  description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
  summary: Kubelet is close to pod limit
State: ok | Last evaluation: 29.885s ago | Evaluation time: 82.4µs

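Rules like these can be unit-tested with promtool before they are deployed. A minimal sketch for K8SKubeletTooManyPods, assuming the rule is saved in a hypothetical kubelet.rules.yaml:

```yaml
# kubelet_test.yaml -- run with: promtool test rules kubelet_test.yaml
rule_files:
  - kubelet.rules.yaml          # hypothetical path to the rule shown above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # a kubelet that stays above the 100-pod threshold for the whole test
      - series: 'kubelet_running_pod_count{instance="node-1"}'
        values: '105+0x20'
    alert_rule_test:
      - eval_time: 15m          # past the 10m "for" duration, so the alert should fire
        alertname: K8SKubeletTooManyPods
        exp_alerts:
          - exp_labels:
              severity: warning
              instance: node-1
```
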
Next rule group, last evaluated 22.134s ago (evaluation took 139.1ms):

record: pod_name:container_memory_usage_bytes:sum
expr: sum by(pod_name) (container_memory_usage_bytes{container_name!="POD",pod_name!=""})
State: ok | Last evaluation: 22.134s ago | Evaluation time: 18.12ms

record: pod_name:container_spec_cpu_shares:sum
expr: sum by(pod_name) (container_spec_cpu_shares{container_name!="POD",pod_name!=""})
State: ok | Last evaluation: 22.116s ago | Evaluation time: 10.56ms

record: pod_name:container_cpu_usage:sum
expr: sum by(pod_name) (rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
State: ok | Last evaluation: 22.106s ago | Evaluation time: 25.87ms

record: pod_name:container_fs_usage_bytes:sum
expr: sum by(pod_name) (container_fs_usage_bytes{container_name!="POD",pod_name!=""})
State: ok | Last evaluation: 22.08s ago | Evaluation time: 433.4µs

record: namespace:container_memory_usage_bytes:sum
expr: sum by(namespace) (container_memory_usage_bytes{container_name!=""})
State: ok | Last evaluation: 22.08s ago | Evaluation time: 6.15ms

record: namespace:container_spec_cpu_shares:sum
expr: sum by(namespace) (container_spec_cpu_shares{container_name!=""})
State: ok | Last evaluation: 22.074s ago | Evaluation time: 3.617ms

record: namespace:container_cpu_usage:sum
expr: sum by(namespace) (rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
State: ok | Last evaluation: 22.07s ago | Evaluation time: 25.81ms

record: cluster:memory_usage:ratio
expr: sum by(cluster) (container_memory_usage_bytes{container_name!="POD",pod_name!=""}) / sum by(cluster) (machine_memory_bytes)
State: ok | Last evaluation: 22.044s ago | Evaluation time: 15.43ms

record: cluster:container_spec_cpu_shares:ratio
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 / sum(machine_cpu_cores)
State: ok | Last evaluation: 22.029s ago | Evaluation time: 8.087ms

record: cluster:container_cpu_usage:ratio
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) / sum(machine_cpu_cores)
State: ok | Last evaluation: 22.021s ago | Evaluation time: 23.9ms

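These recorded names follow the usual level:metric:operations convention for Prometheus recording rules: pod_name:container_memory_usage_bytes:sum, for example, reads as the per-pod_name sum of container_memory_usage_bytes, which keeps dashboards and alerts cheap to evaluate compared with re-running the raw aggregation.
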
record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
labels:
  quantile: "0.99"
State: ok | Last evaluation: 21.997s ago | Evaluation time: 165.4µs

record: apiserver_latency:quantile_seconds
expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
labels:
  quantile: "0.9"
State: ok | Last evaluation: 21.997s ago | Evaluation time: 59.8µs

record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
labels:
  quantile: "0.5"
State: ok | Last evaluation: 21.997s ago | Evaluation time: 53.8µs

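The division by 1e+06 is there because apiserver_request_latencies_bucket is exposed in microseconds by these Kubernetes versions; dividing the computed quantile by one million yields the seconds implied by the recorded name, which is what the latency alerts below compare against.
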
alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
for: 10m
labels:
  severity: warning
annotations:
  description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
State: ok | Last evaluation: 21.997s ago | Evaluation time: 131.7µs

alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
for: 10m
labels:
  severity: critical
annotations:
  description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
State: ok | Last evaluation: 21.997s ago | Evaluation time: 92.6µs

alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 2
for: 10m
labels:
  severity: warning
annotations:
  description: API server returns errors for {{ $value }}% of requests
State: ok | Last evaluation: 21.997s ago | Evaluation time: 184.8µs

alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 5
for: 10m
labels:
  severity: critical
annotations:
  description: API server returns errors for {{ $value }}% of requests
State: ok | Last evaluation: 21.997s ago | Evaluation time: 108.6µs

alert: K8SApiserverDown
expr: absent(up{job="apiserver"} == 1)
for: 20m
labels:
  severity: critical
annotations:
  description: No API servers are reachable or all have disappeared from service discovery
State: ok | Last evaluation: 21.997s ago | Evaluation time: 66.8µs

alert: K8sCertificateExpirationNotice
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
labels:
  severity: warning
annotations:
  description: Kubernetes API Certificate is expiring soon (less than 7 days)
State: ok | Last evaluation: 21.997s ago | Evaluation time: 126.4µs

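The bucket boundary le="604800" is 7 × 24 × 3600 seconds, i.e. seven days, so a non-zero count in that bucket means the API server has seen client certificates that expire within a week; the critical variant below uses le="86400", one day.
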
alert: K8sCertificateExpirationNotice
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
labels:
  severity: critical
annotations:
  description: Kubernetes API Certificate is expiring in less than 1 day
State: ok | Last evaluation: 21.997s ago | Evaluation time: 108.7µs

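All of these alerts carry a severity label of warning or critical, which is what Alertmanager routing typically keys on. A minimal routing sketch, assuming receiver names of my own invention:

```yaml
# alertmanager.yml (fragment) -- receiver names are assumptions
route:
  receiver: default              # fallback receiver
  group_by: ['alertname', 'job']
  routes:
    - match:
        severity: critical
      receiver: pager            # e.g. a paging / on-call integration
    - match:
        severity: warning
      receiver: chat             # e.g. a chat or email integration
receivers:
  - name: default
  - name: pager
  - name: chat
```
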
Next rule group, last evaluated 44.352s ago (evaluation took 2.246ms):

record: instance:node_cpu:rate:sum
expr: sum by(instance) (rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
State: ok | Last evaluation: 44.352s ago | Evaluation time: 470.4µs

record: instance:node_filesystem_usage:sum
expr: sum by(instance) ((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
State: ok | Last evaluation: 44.352s ago | Evaluation time: 208µs

record: instance:node_network_receive_bytes:rate:sum
expr: sum by(instance) (rate(node_network_receive_bytes[3m]))
State: ok | Last evaluation: 44.351s ago | Evaluation time: 96.3µs

record: instance:node_network_transmit_bytes:rate:sum
expr: sum by(instance) (rate(node_network_transmit_bytes[3m]))
State: ok | Last evaluation: 44.351s ago | Evaluation time: 298.1µs

record: instance:node_cpu:ratio
expr: sum without(cpu, mode) (rate(node_cpu{mode!="idle"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu))
State: ok | Last evaluation: 44.351s ago | Evaluation time: 385.7µs

record: cluster:node_cpu:sum_rate5m
expr: sum(rate(node_cpu{mode!="idle"}[5m]))
State: ok | Last evaluation: 44.351s ago | Evaluation time: 92.1µs

record: cluster:node_cpu:ratio
expr: cluster:node_cpu:rate5m / count(sum by(instance, cpu) (node_cpu))
State: ok | Last evaluation: 44.351s ago | Evaluation time: 83.1µs

alert: NodeExporterDown
expr: absent(up{job="node-exporter"} == 1)
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery
State: ok | Last evaluation: 44.351s ago | Evaluation time: 216µs

alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
for: 30m
labels:
  severity: warning
annotations:
  description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})
State: ok | Last evaluation: 44.351s ago | Evaluation time: 173.6µs

alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
for: 10m
labels:
  severity: critical
annotations:
  description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})
State: ok | Last evaluation: 44.351s ago | Evaluation time: 115.7µs

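predict_linear fits a simple linear regression over the sampled window and extrapolates it forward: the warning variant asks whether node_filesystem_free, extrapolated 24 hours ahead from the last 6 hours of data, drops below zero, while the critical variant extrapolates only 2 hours ahead from the last 30 minutes.
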
alert: InactiveRAIDDisk
expr: node_md_disks - node_md_disks_active > 0
for: 10m
labels:
  severity: warning
annotations:
  description: '{{$value}} RAID disk(s) on node {{$labels.instance}} are inactive'
State: ok | Last evaluation: 44.351s ago | Evaluation time: 77.9µs

Next rule group, last evaluated 23ms ago (evaluation took 1.983ms):

alert: PrometheusConfigReloadFailed
expr: prometheus_config_last_reload_successful == 0
for: 10m
labels:
  severity: warning
annotations:
  description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
State: ok | Last evaluation: 23ms ago | Evaluation time: 251.9µs

alert: PrometheusNotificationQueueRunningFull
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}
State: ok | Last evaluation: 23ms ago | Evaluation time: 304µs

alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01
for: 10m
labels:
  severity: warning
annotations:
  description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
State: ok | Last evaluation: 23ms ago | Evaluation time: 175.9µs

alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03
for: 10m
labels:
  severity: critical
annotations:
  description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
State: ok | Last evaluation: 23ms ago | Evaluation time: 120.3µs

alert: PrometheusNotConnectedToAlertmanagers
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers
State: ok | Last evaluation: 23ms ago | Evaluation time: 365.7µs

alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
for: 12h
labels:
  severity: warning
annotations:
  description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.'
  summary: Prometheus has issues reloading data blocks from disk
State: ok | Last evaluation: 22ms ago | Evaluation time: 309.8µs

alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
for: 12h
labels:
  severity: warning
annotations:
  description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.'
  summary: Prometheus has issues compacting sample blocks
State: ok | Last evaluation: 22ms ago | Evaluation time: 258.8µs

alert: PrometheusTSDBWALCorruptions
expr: tsdb_wal_corruptions_total > 0
for: 4h
labels:
  severity: warning
annotations:
  description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
  summary: Prometheus write-ahead log is corrupted
State: ok | Last evaluation: 22ms ago | Evaluation time: 60.7µs

alert: PrometheusNotIngestingSamples
expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.
  summary: Prometheus isn't ingesting samples
State: ok | Last evaluation: 22ms ago | Evaluation time: 112.2µs

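This last group only works if Prometheus scrapes itself (for the prometheus_* metrics) and knows where to deliver alerts. A minimal hand-written sketch of the relevant prometheus.yml pieces, with paths and targets being assumptions on my part (in a kube-prometheus deployment the Prometheus Operator generates the equivalent configuration):

```yaml
# prometheus.yml (fragment) -- rule paths and targets are assumptions
global:
  evaluation_interval: 30s
rule_files:
  - /etc/prometheus/rules/*.rules.yaml   # files containing the groups shown above
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
scrape_configs:
  - job_name: prometheus                  # provides the prometheus_* metrics used above
    static_configs:
      - targets: ['localhost:9090']
```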