| Rule | State | Error | Last Evaluation | Evaluation Time |
| --- | --- | --- | --- | --- |
| `alert: InsufficientMembers`<br>`expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)`<br>`for: 3m`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: If one more etcd member goes down the cluster will be unavailable`<br>`summary: etcd cluster insufficient members` | ok | | 53.773s ago | 383.2us |
| `alert: NoLeader`<br>`expr: etcd_server_has_leader{job="etcd"} == 0`<br>`for: 1m`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: etcd member {{ $labels.instance }} has no leader`<br>`summary: etcd member has no leader` | ok | | 53.773s ago | 73.2us |
| `alert: HighNumberOfLeaderChanges`<br>`expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour`<br>`summary: a high number of leader changes within the etcd cluster are happening` | ok | | 53.773s ago | 113.3us |
| `alert: GRPCRequestsSlow`<br>`expr: histogram_quantile(0.99, sum by(grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job="etcd"}[5m]))) > 0.15`<br>`for: 10m`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow`<br>`summary: slow gRPC requests` | ok | | 53.773s ago | 179.9us |
| `alert: HighNumberOfFailedHTTPRequests`<br>`expr: sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'`<br>`summary: a high number of HTTP requests are failing` | ok | | 53.773s ago | 129us |
| `alert: HighNumberOfFailedHTTPRequests`<br>`expr: sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05`<br>`for: 5m`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'`<br>`summary: a high number of HTTP requests are failing` | ok | | 53.773s ago | 105.6us |
| `alert: HTTPRequestsSlow`<br>`expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow`<br>`summary: slow HTTP requests` | ok | | 53.773s ago | 69.1us |
| `alert: EtcdMemberCommunicationSlow`<br>`expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow`<br>`summary: etcd member communication is slow` | ok | | 53.773s ago | 59.2us |
| `alert: HighNumberOfFailedProposals`<br>`expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour`<br>`summary: a high number of proposals within the etcd cluster are failing` | ok | | 53.773s ago | 77.7us |
| `alert: HighFsyncDurations`<br>`expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: etcd instance {{ $labels.instance }} fsync durations are high`<br>`summary: high fsync durations` | ok | | 53.774s ago | 1.628ms |
| `alert: HighCommitDurations`<br>`expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: etcd instance {{ $labels.instance }} commit durations are high`<br>`summary: high commit durations` | ok | | 53.772s ago | 1.433ms |
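
The table above shows the rules as Prometheus renders them at runtime; in the rule file they are loaded from, each rule sits inside a named group. A minimal sketch of that layout, assuming a group name of `etcd` (the group name is not shown on this page) and reusing the NoLeader alert from the table:

```yaml
# Sketch of the on-disk rule-file layout for one of the alerts above.
# Only the rule fields are taken from the table; the group name is assumed.
groups:
  - name: etcd
    rules:
      - alert: NoLeader
        expr: etcd_server_has_leader{job="etcd"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: etcd member {{ $labels.instance }} has no leader
          summary: etcd member has no leader
```

A file in this format can be syntax-checked with `promtool check rules` before Prometheus loads it.
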
Group last evaluation: 26.954s ago, evaluation time: 2.98ms

| Rule | State | Error | Last Evaluation | Evaluation Time |
| --- | --- | --- | --- | --- |
| `alert: K8SNodeNotReady`<br>`expr: kube_node_status_condition{condition="Ready",status="true"} == 0`<br>`for: 1h`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour`<br>`summary: Node status is NotReady` | ok | | 18.078s ago | 451.5us |
| `alert: K8SManyNodesNotReady`<br>`expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2`<br>`for: 1m`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: '{{ $value }}% of Kubernetes nodes are not ready'` | ok | | 18.078s ago | 698.4us |
| `alert: K8SKubeletDown`<br>`expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3`<br>`for: 1h`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: Prometheus failed to scrape {{ $value }}% of kubelets.` | ok | | 18.078s ago | 436.4us |
| `alert: K8SKubeletDown`<br>`expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) * 100 > 10`<br>`for: 1h`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.`<br>`summary: Many Kubelets cannot be scraped` | ok | | 18.077s ago | 602.7us |
| `alert: K8SKubeletTooManyPods`<br>`expr: kubelet_running_pod_count > 100`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110`<br>`summary: Kubelet is close to pod limit` | ok | | 18.077s ago | 87.1us |
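
Every alert in these tables carries a `severity` label of `warning` or `critical`. The Alertmanager configuration is not part of this page, but routing on that label would typically look like the sketch below; the receiver names are placeholders, not part of this deployment:

```yaml
# Hedged sketch of an Alertmanager route tree keyed on the severity label
# set by the rules above. Receiver names are assumptions.
route:
  receiver: default
  group_by: ['alertname', 'job']
  routes:
    - match:
        severity: critical
      receiver: pager
    - match:
        severity: warning
      receiver: chat
receivers:
  - name: default
  - name: pager
  - name: chat
```
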
Group last evaluation: 10.326s ago, evaluation time: 95.3ms

| Rule | State | Error | Last Evaluation | Evaluation Time |
| --- | --- | --- | --- | --- |
| `record: pod_name:container_memory_usage_bytes:sum`<br>`expr: sum by(pod_name) (container_memory_usage_bytes{container_name!="POD",pod_name!=""})` | ok | | 10.326s ago | 11.85ms |
| `record: pod_name:container_spec_cpu_shares:sum`<br>`expr: sum by(pod_name) (container_spec_cpu_shares{container_name!="POD",pod_name!=""})` | ok | | 10.315s ago | 9.332ms |
| `record: pod_name:container_cpu_usage:sum`<br>`expr: sum by(pod_name) (rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))` | ok | | 10.305s ago | 16.95ms |
| `record: pod_name:container_fs_usage_bytes:sum`<br>`expr: sum by(pod_name) (container_fs_usage_bytes{container_name!="POD",pod_name!=""})` | ok | | 10.288s ago | 535.7us |
| `record: namespace:container_memory_usage_bytes:sum`<br>`expr: sum by(namespace) (container_memory_usage_bytes{container_name!=""})` | ok | | 10.288s ago | 3.533ms |
| `record: namespace:container_spec_cpu_shares:sum`<br>`expr: sum by(namespace) (container_spec_cpu_shares{container_name!=""})` | ok | | 10.285s ago | 2.751ms |
| `record: namespace:container_cpu_usage:sum`<br>`expr: sum by(namespace) (rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))` | ok | | 10.282s ago | 15.99ms |
| `record: cluster:memory_usage:ratio`<br>`expr: sum by(cluster) (container_memory_usage_bytes{container_name!="POD",pod_name!=""}) / sum by(cluster) (machine_memory_bytes)` | ok | | 10.266s ago | 8.848ms |
| `record: cluster:container_spec_cpu_shares:ratio`<br>`expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 / sum(machine_cpu_cores)` | ok | | 10.257s ago | 6.608ms |
| `record: cluster:container_cpu_usage:ratio`<br>`expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) / sum(machine_cpu_cores)` | ok | | 10.251s ago | 16.71ms |
| `record: apiserver_latency_seconds:quantile`<br>`expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / 1e+06`<br>`labels:`<br>`quantile: "0.99"` | ok | | 10.234s ago | 183.1us |
| `record: apiserver_latency_seconds:quantile`<br>`expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / 1e+06`<br>`labels:`<br>`quantile: "0.9"` | ok | | 10.234s ago | 106.2us |
| `record: apiserver_latency_seconds:quantile`<br>`expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / 1e+06`<br>`labels:`<br>`quantile: "0.5"` | ok | | 10.234s ago | 97.8us |
| `alert: APIServerLatencyHigh`<br>`expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH\|WATCHLIST\|PROXY\|CONNECT)$"} > 1`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}` | ok | | 10.234s ago | 199.8us |
| `alert: APIServerLatencyHigh`<br>`expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH\|WATCHLIST\|PROXY\|CONNECT)$"} > 4`<br>`for: 10m`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}` | ok | | 10.234s ago | 152.5us |
| `alert: APIServerErrorsHigh`<br>`expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 2`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: API server returns errors for {{ $value }}% of requests` | ok | | 10.234s ago | 220.5us |
| `alert: APIServerErrorsHigh`<br>`expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 5`<br>`for: 10m`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: API server returns errors for {{ $value }}% of requests` | ok | | 10.234s ago | 186.7us |
| `alert: K8SApiserverDown`<br>`expr: absent(up{job="apiserver"} == 1)`<br>`for: 20m`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: No API servers are reachable or all have disappeared from service discovery` | ok | | 10.234s ago | 532.6us |
| `alert: K8sCertificateExpirationNotice`<br>`expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: Kubernetes API Certificate is expiring soon (less than 7 days)` | ok | | 10.235s ago | 247.1us |
| `alert: K8sCertificateExpirationNotice`<br>`expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: Kubernetes API Certificate is expiring in less than 1 day` | ok | | 10.235s ago | 195.6us |
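
The `record:` entries above follow the usual level:metric:operations naming convention for recording rules and are declared in a rule group just like alerts. A minimal sketch, assuming a group name of `k8s.rules` and reusing two rules from the table:

```yaml
# Sketch of how the recording rules above would appear in a rule file.
# The group name is an assumption; the expressions are taken from the table.
groups:
  - name: k8s.rules
    rules:
      - record: namespace:container_cpu_usage:sum
        expr: sum by(namespace) (rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
        labels:
          quantile: "0.99"
```
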
Group last evaluation: 32.549s ago, evaluation time: 2.088ms

| Rule | State | Error | Last Evaluation | Evaluation Time |
| --- | --- | --- | --- | --- |
| `record: instance:node_cpu:rate:sum`<br>`expr: sum by(instance) (rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))` | ok | | 32.55s ago | 436.7us |
| `record: instance:node_filesystem_usage:sum`<br>`expr: sum by(instance) ((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))` | ok | | 32.549s ago | 273.4us |
| `record: instance:node_network_receive_bytes:rate:sum`<br>`expr: sum by(instance) (rate(node_network_receive_bytes[3m]))` | ok | | 32.549s ago | 112.8us |
| `record: instance:node_network_transmit_bytes:rate:sum`<br>`expr: sum by(instance) (rate(node_network_transmit_bytes[3m]))` | ok | | 32.549s ago | 92.6us |
| `record: instance:node_cpu:ratio`<br>`expr: sum without(cpu, mode) (rate(node_cpu{mode!="idle"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu))` | ok | | 32.549s ago | 191.2us |
| `record: cluster:node_cpu:sum_rate5m`<br>`expr: sum(rate(node_cpu{mode!="idle"}[5m]))` | ok | | 32.549s ago | 108.4us |
| `record: cluster:node_cpu:ratio`<br>`expr: cluster:node_cpu:sum_rate5m / count(sum by(instance, cpu) (node_cpu))` | ok | | 32.549s ago | 91.7us |
| `alert: NodeExporterDown`<br>`expr: absent(up{job="node-exporter"} == 1)`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery` | ok | | 32.549s ago | 266us |
| `alert: NodeDiskRunningFull`<br>`expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0`<br>`for: 30m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})` | ok | | 32.549s ago | 247.2us |
| `alert: NodeDiskRunningFull`<br>`expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0`<br>`for: 10m`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})` | ok | | 32.55s ago | 138.2us |
| `alert: InactiveRAIDDisk`<br>`expr: node_md_disks - node_md_disks_active > 0`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: '{{$value}} RAID disk(s) on node {{$labels.instance}} are inactive'` | ok | | 32.55s ago | 91.4us |
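
Rules like these can be unit-tested with `promtool test rules`. A sketch for the InactiveRAIDDisk alert above, assuming the group is stored in a file named node.rules.yml and using made-up sample values:

```yaml
# Hedged promtool unit-test sketch for the InactiveRAIDDisk alert.
# File name, labels, and sample values are assumptions.
rule_files:
  - node.rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      - series: 'node_md_disks{instance="node-1", device="md0"}'
        values: '4+0x20'    # four RAID member disks for 20 minutes
      - series: 'node_md_disks_active{instance="node-1", device="md0"}'
        values: '3+0x20'    # only three of them active
    alert_rule_test:
      - eval_time: 15m      # past the 10m "for" clause
        alertname: InactiveRAIDDisk
        exp_alerts:
          - exp_labels:
              severity: warning
              instance: node-1
              device: md0
            exp_annotations:
              description: '1 RAID disk(s) on node node-1 are inactive'
```

Such a file is run with `promtool test rules <test-file>`.
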
Group last evaluation: 48.219s ago, evaluation time: 1.863ms

| Rule | State | Error | Last Evaluation | Evaluation Time |
| --- | --- | --- | --- | --- |
| `alert: PrometheusConfigReloadFailed`<br>`expr: prometheus_config_last_reload_successful == 0`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}` | ok | | 48.22s ago | 259.1us |
| `alert: PrometheusNotificationQueueRunningFull`<br>`expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}` | ok | | 48.22s ago | 297.8us |
| `alert: PrometheusErrorSendingAlerts`<br>`expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}` | ok | | 48.219s ago | 137.4us |
| `alert: PrometheusErrorSendingAlerts`<br>`expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03`<br>`for: 10m`<br>`labels:`<br>`severity: critical`<br>`annotations:`<br>`description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}` | ok | | 48.22s ago | 129.3us |
| `alert: PrometheusNotConnectedToAlertmanagers`<br>`expr: prometheus_notifications_alertmanagers_discovered < 1`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers` | ok | | 48.22s ago | 410.7us |
| `alert: PrometheusTSDBReloadsFailing`<br>`expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0`<br>`for: 12h`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: '{{$labels.job}} at {{$labels.instance}} had {{$value \| humanize}} reload failures over the last two hours.'`<br>`summary: Prometheus has issues reloading data blocks from disk` | ok | | 48.219s ago | 264.5us |
| `alert: PrometheusTSDBCompactionsFailing`<br>`expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0`<br>`for: 12h`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: '{{$labels.job}} at {{$labels.instance}} had {{$value \| humanize}} compaction failures over the last two hours.'`<br>`summary: Prometheus has issues compacting sample blocks` | ok | | 48.219s ago | 183.2us |
| `alert: PrometheusTSDBWALCorruptions`<br>`expr: tsdb_wal_corruptions_total > 0`<br>`for: 4h`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'`<br>`summary: Prometheus write-ahead log is corrupted` | ok | | 48.219s ago | 55.7us |
| `alert: PrometheusNotIngestingSamples`<br>`expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0`<br>`for: 10m`<br>`labels:`<br>`severity: warning`<br>`annotations:`<br>`description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.`<br>`summary: Prometheus isn't ingesting samples` | ok | | 48.219s ago | 104.2us |
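
All of these groups are loaded into Prometheus through the `rule_files` section of its configuration, and the alerts they produce are sent to Alertmanager via the `alerting` section. A minimal sketch, with the rule path and Alertmanager address as assumptions:

```yaml
# Hedged sketch of the prometheus.yml stanzas that load rule files and
# point Prometheus at an Alertmanager. The path and target are placeholders.
rule_files:
  - /etc/prometheus/rules/*.rules.yml
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
```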