Alerts


/etc/prometheus/rules/alertmanager.rules.yaml > alertmanager.rules
AlertmanagerConfigInconsistent (0 active)
alert: AlertmanagerConfigInconsistent
expr: count_values by(service) ("config_hash", alertmanager_config_hash) / on(service) group_left() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
  severity: critical
annotations:
  description: The configurations of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
AlertmanagerDownOrMissing (0 active)
alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / on(job) group_right() sum by(job) (up) != 1
for: 5m
labels:
  severity: warning
annotations:
  description: An unexpected number of Alertmanagers are being scraped, or Alertmanagers have disappeared from service discovery.
AlertmanagerFailedReload (0 active)
alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
labels:
  severity: warning
annotations:
  description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
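Each entry above is one alerting rule from the group named in the file header. Written out as a standalone Prometheus rules file, the same content would look roughly like the following sketch (AlertmanagerFailedReload shown as the example):
  groups:
    - name: alertmanager.rules
      rules:
        - alert: AlertmanagerFailedReload
          expr: alertmanager_config_last_reload_successful == 0
          for: 10m
          labels:
            severity: warning
          annotations:
            description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
Such a file can be syntax-checked with promtool check rules <file> before Prometheus loads it.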
/etc/prometheus/rules/etcd3.rules.yaml > ./etcd3.rules
EtcdMemberCommunicationSlow (0 active)
alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
for: 10m
labels:
  severity: warning
annotations:
  description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
  summary: etcd member communication is slow
GRPCRequestsSlow (0 active)
alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, sum by(grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job="etcd"}[5m]))) > 0.15
for: 10m
labels:
  severity: critical
annotations:
  description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
  summary: slow gRPC requests
HTTPRequestsSlow (0 active)
alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
  severity: warning
annotations:
  description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
  summary: slow HTTP requests
HighCommitDurations (0 active)
alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
for: 10m
labels:
  severity: warning
annotations:
  description: etcd instance {{ $labels.instance }} commit durations are high
  summary: high commit durations
HighFsyncDurations (0 active)
alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
for: 10m
labels:
  severity: warning
annotations:
  description: etcd instance {{ $labels.instance }} fsync durations are high
  summary: high fsync durations
HighNumberOfFailedHTTPRequests (0 active)
alert: HighNumberOfFailedHTTPRequests
expr: sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
for: 5m
labels:
  severity: critical
annotations:
  description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
  summary: a high number of HTTP requests are failing
HighNumberOfFailedHTTPRequests (0 active)
alert: HighNumberOfFailedHTTPRequests
expr: sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
for: 10m
labels:
  severity: warning
annotations:
  description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
  summary: a high number of HTTP requests are failing
HighNumberOfFailedProposals (0 active)
alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
  severity: warning
annotations:
  description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
  summary: a high number of proposals within the etcd cluster are failing
HighNumberOfLeaderChanges (0 active)
alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
labels:
  severity: warning
annotations:
  description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
  summary: a high number of leader changes within the etcd cluster are happening
InsufficientMembers (0 active)
alert: InsufficientMembers
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
for: 3m
labels:
  severity: critical
annotations:
  description: If one more etcd member goes down, the cluster will be unavailable
  summary: etcd cluster insufficient members
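The right-hand side of the expression is quorum arithmetic: for a 3-member cluster it evaluates to 3/2 - 1 = 0.5, so the alert fires as soon as one member is down; for 5 members it is 1.5, i.e. two members down. In either case one further failure would cost the cluster its quorum. A sketch of a query that shows the remaining failure margin directly (assuming the same job="etcd" labelling used above):
  sum(up{job="etcd"}) - (floor(count(up{job="etcd"}) / 2) + 1)
A result of 0 means the cluster is at the edge of quorum, which is exactly the condition this alert fires on.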
NoLeader (0 active)
alert: NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
for: 1m
labels:
  severity: critical
annotations:
  description: etcd member {{ $labels.instance }} has no leader
  summary: etcd member has no leader
/etc/prometheus/rules/general.rules.yaml > general.rules
FdExhaustionClose (0 active)
alert: FdExhaustionClose
expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
for: 10m
labels:
  severity: warning
annotations:
  description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance will exhaust its available file/socket descriptors within the next 4 hours'
  summary: file descriptors soon exhausted
FdExhaustionClose (0 active)
alert: FdExhaustionClose
expr: predict_linear(fd_utilization[10m], 3600) > 1
for: 10m
labels:
  severity: critical
annotations:
  description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance will exhaust its available file/socket descriptors within the next hour'
  summary: file descriptors soon exhausted
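Both FdExhaustionClose rules fire when a linear extrapolation of fd_utilization over the sampled window crosses 1 (100% utilization) within the stated horizon. fd_utilization is not a metric exporters expose directly; it is presumably produced by a recording rule not shown in this dump, along the lines of this sketch built from the standard process metrics:
  - record: fd_utilization
    expr: process_open_fds / process_max_fds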
TargetDown (0 active)
alert: TargetDown
expr: 100 * (count by(job) (up == 0) / count by(job) (up)) > 10
for: 10m
labels:
  severity: warning
annotations:
  description: '{{ $value }}% of {{ $labels.job }} targets are down.'
  summary: Targets are down
/etc/prometheus/rules/kube-state-metrics.rules.yaml > kube-state-metrics.rules
DaemonSetRolloutStuck (1 active)
alert: DaemonSetRolloutStuck
expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100
for: 15m
labels:
  severity: warning
annotations:
  description: Only {{$value}}% of desired pods are scheduled and ready for daemon set {{$labels.namespace}}/{{$labels.daemonset}}
  summary: DaemonSet is missing pods
Labels: alertname="DaemonSetRolloutStuck", app_kubernetes_io_created_by="resource-stack", app_kubernetes_io_managed_by="Lens", app_kubernetes_io_name="lens-metrics", daemonset="kube-prometheus-node-exporter", instance="10.1.1.171:8080", job="kube-state-metrics", kubernetes_namespace="prometheus", kubernetes_node="aks-agentpool19-14716633-vmss00000l", name="kube-state-metrics", namespace="prometheus", severity="warning"
State: firing
Active Since: 2025-11-26 09:49:29.003005996 +0000 UTC
Value: 83.33333333333334
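To see which DaemonSets are short of ready pods, and by how many, the same kube-state-metrics series the alert uses can be queried directly, for example:
  kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready > 0
For the firing instance above, a value of 83.33% ready would correspond to, e.g., five of six desired kube-prometheus-node-exporter pods being ready.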
DaemonSetsMissScheduled (0 active)
alert: DaemonSetsMissScheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
  severity: warning
annotations:
  description: A number of daemonsets are running where they are not supposed to run.
  summary: Daemonsets are not scheduled correctly
DeploymentGenerationMismatch (0 active)
alert: DeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
for: 15m
labels:
  severity: warning
annotations:
  description: Observed deployment generation does not match the expected one for deployment {{$labels.namespace}}/{{$labels.deployment}}
  summary: Deployment is outdated
DeploymentReplicasNotUpdated (0 active)
alert: DeploymentReplicasNotUpdated
expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) unless (kube_deployment_spec_paused == 1)
for: 15m
labels:
  severity: warning
annotations:
  description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}}
  summary: Deployment replicas are outdated
K8SDaemonSetsNotScheduled (0 active)
alert: K8SDaemonSetsNotScheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 10m
labels:
  severity: warning
annotations:
  description: A number of daemonsets are not scheduled.
  summary: Daemonsets are not scheduled correctly
PodFrequentlyRestarting (0 active)
alert: PodFrequentlyRestarting
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
for: 10m
labels:
  severity: warning
annotations:
  description: Pod {{$labels.namespace}}/{{$labels.pod}} restarted {{$value}} times within the last hour
  summary: Pod is restarting frequently
/etc/prometheus/rules/kubelet.rules.yaml > kubelet.rules
K8SKubeletDown (0 active)
alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
for: 1h
labels:
  severity: warning
annotations:
  description: Prometheus failed to scrape {{ $value }}% of kubelets.
K8SKubeletDown (0 active)
alert: K8SKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) * 100 > 10
for: 1h
labels:
  severity: critical
annotations:
  description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
  summary: Many Kubelets cannot be scraped
K8SKubeletTooManyPods (0 active)
alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
for: 10m
labels:
  severity: warning
annotations:
  description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
  summary: Kubelet is close to pod limit
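To find the kubelets closest to the default 110-pod limit, a quick query is:
  topk(5, kubelet_running_pod_count)
Depending on the kubelet version, this metric may instead be exposed as kubelet_running_pods, in which case the alert expression would need adjusting.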
K8SManyNodesNotReady (0 active)
K8SNodeNotReady (0 active)
alert: K8SNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1h
labels:
  severity: warning
annotations:
  description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
  summary: Node status is NotReady
/etc/prometheus/rules/kubernetes.rules.yaml > kubernetes.rules
APIServerErrorsHigh (0 active)
alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 2
for: 10m
labels:
  severity: warning
annotations:
  description: API server returns errors for {{ $value }}% of requests
APIServerErrorsHigh (0 active)
alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 5
for: 10m
labels:
  severity: critical
annotations:
  description: API server returns errors for {{ $value }}% of requests
APIServerLatencyHigh (0 active)
alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
for: 10m
labels:
  severity: warning
annotations:
  description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
APIServerLatencyHigh (0 active)
alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
for: 10m
labels:
  severity: critical
annotations:
  description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
K8SApiserverDown (0 active)
alert: K8SApiserverDown
expr: absent(up{job="apiserver"} == 1)
for: 20m
labels:
  severity: critical
annotations:
  description: No API servers are reachable or all have disappeared from service discovery
K8sCertificateExpirationNotice (0 active)
alert: K8sCertificateExpirationNotice
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
labels:
  severity: warning
annotations:
  description: Kubernetes API Certificate is expiring soon (less than 7 days)
K8sCertificateExpirationNotice (0 active)
alert: K8sCertificateExpirationNotice
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
labels:
  severity: critical
annotations:
  description: Kubernetes API Certificate is expiring in less than 1 day
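The bucket boundaries encode the notice windows directly: le="604800" is 7 * 24 * 3600 seconds (one week) and le="86400" is one day. A sketch of a per-bucket view (the buckets count certificate observations with less than that much lifetime remaining):
  sum by (le) (apiserver_client_certificate_expiration_seconds_bucket{le=~"86400|604800"})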
/etc/prometheus/rules/node.rules.yaml > node.rules
InactiveRAIDDisk (0 active)
alert: InactiveRAIDDisk
expr: node_md_disks - node_md_disks_active > 0
for: 10m
labels:
  severity: warning
annotations:
  description: '{{$value}} RAID disk(s) on node {{$labels.instance}} are inactive'
NodeDiskRunningFull (0 active)
alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
for: 30m
labels:
  severity: warning
annotations:
  description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})
NodeDiskRunningFull (0 active)
alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
for: 10m
labels:
  severity: critical
annotations:
  description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})
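Both NodeDiskRunningFull rules use the older node_exporter metric name node_filesystem_free; node_exporter 0.16 and later expose node_filesystem_free_bytes instead, so on such nodes the expression would need to be adjusted roughly as follows:
  predict_linear(node_filesystem_free_bytes[6h], 3600 * 24) < 0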
NodeExporterDown (0 active)
alert: NodeExporterDown
expr: absent(up{job="node-exporter"} == 1)
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery
/etc/prometheus/rules/prometheus.rules.yaml > prometheus.rules
PrometheusNotConnectedToAlertmanagers (1 active)
alert: PrometheusNotConnectedToAlertmanagers
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers
Labels: alertname="PrometheusNotConnectedToAlertmanagers", app_kubernetes_io_created_by="resource-stack", app_kubernetes_io_managed_by="Lens", app_kubernetes_io_name="lens-metrics", instance="10.1.1.84:9090", job="prometheus", kubernetes_namespace="lens-metrics", kubernetes_node="aks-agentpool19-14716633-vmss00000l", severity="warning"
State: firing
Active Since: 2024-07-01 09:35:18.442708538 +0000 UTC
Value: 0
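The firing instance is the Lens-managed Prometheus in the lens-metrics namespace; a value of 0 means its service discovery has found no Alertmanager endpoints at all. Prometheus discovers Alertmanagers through the alerting section of its configuration; a minimal static setup looks roughly like the sketch below (the target host and port are illustrative assumptions, not taken from this cluster):
  alerting:
    alertmanagers:
      - static_configs:
          - targets:
              - alertmanager.monitoring.svc:9093
The Alertmanagers a running server has actually discovered can also be inspected via its /api/v1/alertmanagers HTTP API endpoint.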
PrometheusConfigReloadFailed (0 active)
alert: PrometheusConfigReloadFailed
expr: prometheus_config_last_reload_successful == 0
for: 10m
labels:
  severity: warning
annotations:
  description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
PrometheusErrorSendingAlerts (0 active)
alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01
for: 10m
labels:
  severity: warning
annotations:
  description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
PrometheusErrorSendingAlerts (0 active)
alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03
for: 10m
labels:
  severity: critical
annotations:
  description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
PrometheusNotIngestingSamples (0 active)
alert: PrometheusNotIngestingSamples
expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.
  summary: Prometheus isn't ingesting samples
PrometheusNotificationQueueRunningFull (0 active)
alert: PrometheusNotificationQueueRunningFull
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}
PrometheusTSDBCompactionsFailing (0 active)
alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
for: 12h
labels:
  severity: warning
annotations:
  description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.'
  summary: Prometheus has issues compacting sample blocks
PrometheusTSDBReloadsFailing (0 active)
alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
for: 12h
labels:
  severity: warning
annotations:
  description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.'
  summary: Prometheus has issues reloading data blocks from disk
PrometheusTSDBWALCorruptions (0 active)
alert: PrometheusTSDBWALCorruptions
expr: tsdb_wal_corruptions_total > 0
for: 4h
labels:
  severity: warning
annotations:
  description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
  summary: Prometheus write-ahead log is corrupted