Skip to main content
Version: 2.4.15

PromQL Expression Reference

The PromQL expressions in this doc can be used to configure alerts.

For more information about querying the Prometheus time series database, refer to the official Prometheus documentation.

\<!-- TOC -->

\<!-- /TOC -->

Cluster Metrics

Cluster CPU Utilization#

| Catalog | Expression |
| --- | --- |
| Detail | `1 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))` |
| Summary | `1 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])))` |

Cluster Load Average#

CatalogExpression
Detail\<table>\<tr>\<td>load1\</td>\<td>sum(node_load1) by (instance) / count(node_cpu_seconds_total{mode="system"}) by (instance)\</td>\</tr>\<tr>\<td>load5\</td>\<td>sum(node_load5) by (instance) / count(node_cpu_seconds_total{mode="system"}) by (instance)\</td>\</tr>\<tr>\<td>load15\</td>\<td>sum(node_load15) by (instance) / count(node_cpu_seconds_total{mode="system"}) by (instance)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>load1\</td>\<td>sum(node_load1) / count(node_cpu_seconds_total{mode="system"})\</td>\</tr>\<tr>\<td>load5\</td>\<td>sum(node_load5) / count(node_cpu_seconds_total{mode="system"})\</td>\</tr>\<tr>\<td>load15\</td>\<td>sum(node_load15) / count(node_cpu_seconds_total{mode="system"})\</td>\</tr>\</table>

Cluster Memory Utilization#

| Catalog | Expression |
| --- | --- |
| Detail | `1 - sum(node_memory_MemAvailable_bytes) by (instance) / sum(node_memory_MemTotal_bytes) by (instance)` |
| Summary | `1 - sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)` |

Cluster Disk Utilization#

| Catalog | Expression |
| --- | --- |
| Detail | `(sum(node_filesystem_size_bytes{device!="rootfs"}) by (instance) - sum(node_filesystem_free_bytes{device!="rootfs"}) by (instance)) / sum(node_filesystem_size_bytes{device!="rootfs"}) by (instance)` |
| Summary | `(sum(node_filesystem_size_bytes{device!="rootfs"}) - sum(node_filesystem_free_bytes{device!="rootfs"})) / sum(node_filesystem_size_bytes{device!="rootfs"})` |

Cluster Disk I/O#

CatalogExpression
Detail\<table>\<tr>\<td>read\</td>\<td>sum(rate(node_disk_read_bytes_total[5m])) by (instance)\</td>\</tr>\<tr>\<td>written\</td>\<td>sum(rate(node_disk_written_bytes_total[5m])) by (instance)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>read\</td>\<td>sum(rate(node_disk_read_bytes_total[5m]))\</td>\</tr>\<tr>\<td>written\</td>\<td>sum(rate(node_disk_written_bytes_total[5m]))\</td>\</tr>\</table>

Cluster Network Packets#

CatalogExpression
Detail\<table>\<tr>\<td>receive-dropped\</td>\<td>\<code>sum(rate(node_network_receive_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)\</code>\</td>\</tr>\<tr>\<td>receive-errs\</td>\<td>\<code>sum(rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)\</code>\</td>\</tr>\<tr>\<td>receive-packets\</td>\<td>\<code>sum(rate(node_network_receive_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)\</code>\</td>\</tr>\<tr>\<td>transmit-dropped\</td>\<td>\<code>sum(rate(node_network_transmit_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)\</code>\</td>\</tr>\<tr>\<td>transmit-errs\</td>\<td>\<code>sum(rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)\</code>\</td>\</tr>\<tr>\<td>transmit-packets\</td>\<td>\<code>sum(rate(node_network_transmit_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)\</code>\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>receive-dropped\</td>\<td>\<code>sum(rate(node_network_receive_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))\</code>\</td>\</tr>\<tr>\<td>receive-errs\</td>\<td>\<code>sum(rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))\</code>\</td>\</tr>\<tr>\<td>receive-packets\</td>\<td>\<code>sum(rate(node_network_receive_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))\</code>\</td>\</tr>\<tr>\<td>transmit-dropped\</td>\<td>\<code>sum(rate(node_network_transmit_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))\</code>\</td>\</tr>\<tr>\<td>transmit-errs\</td>\<td>\<code>sum(rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))\</code>\</td>\</tr>\<tr>\<td>transmit-packets\</td>\<td>\<code>sum(rate(node_network_transmit_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))\</code>\</td>\</tr>\</table>

Cluster Network I/O#

CatalogExpression
Detail\<table>\<tr>\<td>receive\</td>\<td>\<code>sum(rate(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)\</code>\</td>\</tr>\<tr>\<td>transmit\</td>\<td>\<code>sum(rate(node_network_transmit_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m])) by (instance)\</code>\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>receive\</td>\<td>\<code>sum(rate(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))\</code>\</td>\</tr>\<tr>\<td>transmit\</td>\<td>\<code>sum(rate(node_network_transmit_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*"}[5m]))\</code>\</td>\</tr>\</table>

Node Metrics

Node CPU Utilization#

| Catalog | Expression |
| --- | --- |
| Detail | `avg(irate(node_cpu_seconds_total{mode!="idle", instance=~"$instance"}[5m])) by (mode)` |
| Summary | `1 - (avg(irate(node_cpu_seconds_total{mode="idle", instance=~"$instance"}[5m])))` |

Node Load Average#

CatalogExpression
Detail\<table>\<tr>\<td>load1\</td>\<td>sum(node_load1{instance=~"$instance"}) / count(node_cpu_seconds_total{mode="system",instance=~"$instance"})\</td>\</tr>\<tr>\<td>load5\</td>\<td>sum(node_load5{instance=~"$instance"}) / count(node_cpu_seconds_total{mode="system",instance=~"$instance"})\</td>\</tr>\<tr>\<td>load15\</td>\<td>sum(node_load15{instance=~"$instance"}) / count(node_cpu_seconds_total{mode="system",instance=~"$instance"})\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>load1\</td>\<td>sum(node_load1{instance=~"$instance"}) / count(node_cpu_seconds_total{mode="system",instance=~"$instance"})\</td>\</tr>\<tr>\<td>load5\</td>\<td>sum(node_load5{instance=~"$instance"}) / count(node_cpu_seconds_total{mode="system",instance=~"$instance"})\</td>\</tr>\<tr>\<td>load15\</td>\<td>sum(node_load15{instance=~"$instance"}) / count(node_cpu_seconds_total{mode="system",instance=~"$instance"})\</td>\</tr>\</table>

Node Memory Utilization#

| Catalog | Expression |
| --- | --- |
| Detail | `1 - sum(node_memory_MemAvailable_bytes{instance=~"$instance"}) / sum(node_memory_MemTotal_bytes{instance=~"$instance"})` |
| Summary | `1 - sum(node_memory_MemAvailable_bytes{instance=~"$instance"}) / sum(node_memory_MemTotal_bytes{instance=~"$instance"})` |

Node Disk Utilization#

| Catalog | Expression |
| --- | --- |
| Detail | `(sum(node_filesystem_size_bytes{device!="rootfs",instance=~"$instance"}) by (device) - sum(node_filesystem_free_bytes{device!="rootfs",instance=~"$instance"}) by (device)) / sum(node_filesystem_size_bytes{device!="rootfs",instance=~"$instance"}) by (device)` |
| Summary | `(sum(node_filesystem_size_bytes{device!="rootfs",instance=~"$instance"}) - sum(node_filesystem_free_bytes{device!="rootfs",instance=~"$instance"})) / sum(node_filesystem_size_bytes{device!="rootfs",instance=~"$instance"})` |

Node Disk I/O#

CatalogExpression
Detail\<table>\<tr>\<td>read\</td>\<td>sum(rate(node_disk_read_bytes_total{instance=~"$instance"}[5m]))\</td>\</tr>\<tr>\<td>written\</td>\<td>sum(rate(node_disk_written_bytes_total{instance=~"$instance"}[5m]))\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>read\</td>\<td>sum(rate(node_disk_read_bytes_total{instance=~"$instance"}[5m]))\</td>\</tr>\<tr>\<td>written\</td>\<td>sum(rate(node_disk_written_bytes_total{instance=~"$instance"}[5m]))\</td>\</tr>\</table>

Node Network Packets#

CatalogExpression
Detail\<table>\<tr>\<td>receive-dropped\</td>\<td>\<code>sum(rate(node_network_receive_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)\</code>\</td>\</tr>\<tr>\<td>receive-errs\</td>\<td>\<code>sum(rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)\</code>\</td>\</tr>\<tr>\<td>receive-packets\</td>\<td>\<code>sum(rate(node_network_receive_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)\</code>\</td>\</tr>\<tr>\<td>transmit-dropped\</td>\<td>\<code>sum(rate(node_network_transmit_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)\</code>\</td>\</tr>\<tr>\<td>transmit-errs\</td>\<td>\<code>sum(rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)\</code>\</td>\</tr>\<tr>\<td>transmit-packets\</td>\<td>\<code>sum(rate(node_network_transmit_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)\</code>\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>receive-dropped\</td>\<td>\<code>sum(rate(node_network_receive_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))\</code>\</td>\</tr>\<tr>\<td>receive-errs\</td>\<td>\<code>sum(rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))\</code>\</td>\</tr>\<tr>\<td>receive-packets\</td>\<td>\<code>sum(rate(node_network_receive_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))\</code>\</td>\</tr>\<tr>\<td>transmit-dropped\</td>\<td>\<code>sum(rate(node_network_transmit_drop_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))\</code>\</td>\</tr>\<tr>\<td>transmit-errs\</td>\<td>\<code>sum(rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))\</code>\</td>\</tr>\<tr>\<td>transmit-packets\</td>\<td>\<code>sum(rate(node_network_transmit_packets_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))\</code>\</td>\</tr>\</table>

Node Network I/O#

CatalogExpression
Detail\<table>\<tr>\<td>receive\</td>\<td>\<code>sum(rate(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)\</code>\</td>\</tr>\<tr>\<td>transmit\</td>\<td>\<code>sum(rate(node_network_transmit_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m])) by (device)\</code>\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>receive\</td>\<td>\<code>sum(rate(node_network_receive_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))\</code>\</td>\</tr>\<tr>\<td>transmit\</td>\<td>\<code>sum(rate(node_network_transmit_bytes_total{device!~"lo|veth.*|docker.*|flannel.*|cali.*|cbr.*",instance=~"$instance"}[5m]))\</code>\</td>\</tr>\</table>

Etcd Metrics

Etcd Has a Leader#

max(etcd_server_has_leader)

Number of Times the Leader Changes#

max(etcd_server_leader_changes_seen_total)

Number of Failed Proposals#

sum(etcd_server_proposals_failed_total)

GRPC Client Traffic#

CatalogExpression
Detail\<table>\<tr>\<td>in\</td>\<td>sum(rate(etcd_network_client_grpc_received_bytes_total[5m])) by (instance)\</td>\</tr>\<tr>\<td>out\</td>\<td>sum(rate(etcd_network_client_grpc_sent_bytes_total[5m])) by (instance)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>in\</td>\<td>sum(rate(etcd_network_client_grpc_received_bytes_total[5m]))\</td>\</tr>\<tr>\<td>out\</td>\<td>sum(rate(etcd_network_client_grpc_sent_bytes_total[5m]))\</td>\</tr>\</table>

Peer Traffic#

CatalogExpression
Detail\<table>\<tr>\<td>in\</td>\<td>sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)\</td>\</tr>\<tr>\<td>out\</td>\<td>sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>in\</td>\<td>sum(rate(etcd_network_peer_received_bytes_total[5m]))\</td>\</tr>\<tr>\<td>out\</td>\<td>sum(rate(etcd_network_peer_sent_bytes_total[5m]))\</td>\</tr>\</table>

DB Size#

| Catalog | Expression |
| --- | --- |
| Detail | `sum(etcd_debugging_mvcc_db_total_size_in_bytes) by (instance)` |
| Summary | `sum(etcd_debugging_mvcc_db_total_size_in_bytes)` |

Active Streams#

CatalogExpression
Detail\<table>\<tr>\<td>lease-watch\</td>\<td>sum(grpc_server_started_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) by (instance) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) by (instance)\</td>\</tr>\<tr>\<td>watch\</td>\<td>sum(grpc_server_started_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) by (instance) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) by (instance)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>lease-watch\</td>\<td>sum(grpc_server_started_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})\</td>\</tr>\<tr>\<td>watch\</td>\<td>sum(grpc_server_started_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})\</td>\</tr>\</table>

Raft Proposals#

CatalogExpression
Detail\<table>\<tr>\<td>applied\</td>\<td>sum(increase(etcd_server_proposals_applied_total[5m])) by (instance)\</td>\</tr>\<tr>\<td>committed\</td>\<td>sum(increase(etcd_server_proposals_committed_total[5m])) by (instance)\</td>\</tr>\<tr>\<td>pending\</td>\<td>sum(increase(etcd_server_proposals_pending[5m])) by (instance)\</td>\</tr>\<tr>\<td>failed\</td>\<td>sum(increase(etcd_server_proposals_failed_total[5m])) by (instance)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>applied\</td>\<td>sum(increase(etcd_server_proposals_applied_total[5m]))\</td>\</tr>\<tr>\<td>committed\</td>\<td>sum(increase(etcd_server_proposals_committed_total[5m]))\</td>\</tr>\<tr>\<td>pending\</td>\<td>sum(increase(etcd_server_proposals_pending[5m]))\</td>\</tr>\<tr>\<td>failed\</td>\<td>sum(increase(etcd_server_proposals_failed_total[5m]))\</td>\</tr>\</table>

RPC Rate#

CatalogExpression
Detail\<table>\<tr>\<td>total\</td>\<td>sum(rate(grpc_server_started_total{grpc_type="unary"}[5m])) by (instance)\</td>\</tr>\<tr>\<td>fail\</td>\<td>sum(rate(grpc_server_handled_total{grpc_type="unary",grpc_code!="OK"}[5m])) by (instance)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>total\</td>\<td>sum(rate(grpc_server_started_total{grpc_type="unary"}[5m]))\</td>\</tr>\<tr>\<td>fail\</td>\<td>sum(rate(grpc_server_handled_total{grpc_type="unary",grpc_code!="OK"}[5m]))\</td>\</tr>\</table>

Disk Operations#

CatalogExpression
Detail\<table>\<tr>\<td>commit-called-by-backend\</td>\<td>sum(rate(etcd_disk_backend_commit_duration_seconds_sum[1m])) by (instance)\</td>\</tr>\<tr>\<td>fsync-called-by-wal\</td>\<td>sum(rate(etcd_disk_wal_fsync_duration_seconds_sum[1m])) by (instance)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>commit-called-by-backend\</td>\<td>sum(rate(etcd_disk_backend_commit_duration_seconds_sum[1m]))\</td>\</tr>\<tr>\<td>fsync-called-by-wal\</td>\<td>sum(rate(etcd_disk_wal_fsync_duration_seconds_sum[1m]))\</td>\</tr>\</table>

Disk Sync Duration#

CatalogExpression
Detail\<table>\<tr>\<td>wal\</td>\<td>histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))\</td>\</tr>\<tr>\<td>db\</td>\<td>histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>wal\</td>\<td>sum(histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le)))\</td>\</tr>\<tr>\<td>db\</td>\<td>sum(histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le)))\</td>\</tr>\</table>

Kubernetes Components Metrics

API Server Request Latency#

| Catalog | Expression |
| --- | --- |
| Detail | `avg(apiserver_request_latencies_sum / apiserver_request_latencies_count) by (instance, verb) /1e+06` |
| Summary | `avg(apiserver_request_latencies_sum / apiserver_request_latencies_count) by (instance) /1e+06` |

API Server Request Rate#

| Catalog | Expression |
| --- | --- |
| Detail | `sum(rate(apiserver_request_count[5m])) by (instance, code)` |
| Summary | `sum(rate(apiserver_request_count[5m])) by (instance)` |

Scheduling Failed Pods#

| Catalog | Expression |
| --- | --- |
| Detail | `sum(kube_pod_status_scheduled{condition="false"})` |
| Summary | `sum(kube_pod_status_scheduled{condition="false"})` |

Controller Manager Queue Depth#

CatalogExpression
Detail\<table>\<tr>\<td>volumes\</td>\<td>sum(volumes_depth) by (instance)\</td>\</tr>\<tr>\<td>deployment\</td>\<td>sum(deployment_depth) by (instance)\</td>\</tr>\<tr>\<td>replicaset\</td>\<td>sum(replicaset_depth) by (instance)\</td>\</tr>\<tr>\<td>service\</td>\<td>sum(service_depth) by (instance)\</td>\</tr>\<tr>\<td>serviceaccount\</td>\<td>sum(serviceaccount_depth) by (instance)\</td>\</tr>\<tr>\<td>endpoint\</td>\<td>sum(endpoint_depth) by (instance)\</td>\</tr>\<tr>\<td>daemonset\</td>\<td>sum(daemonset_depth) by (instance)\</td>\</tr>\<tr>\<td>statefulset\</td>\<td>sum(statefulset_depth) by (instance)\</td>\</tr>\<tr>\<td>replicationmanager\</td>\<td>sum(replicationmanager_depth) by (instance)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>volumes\</td>\<td>sum(volumes_depth)\</td>\</tr>\<tr>\<td>deployment\</td>\<td>sum(deployment_depth)\</td>\</tr>\<tr>\<td>replicaset\</td>\<td>sum(replicaset_depth)\</td>\</tr>\<tr>\<td>service\</td>\<td>sum(service_depth)\</td>\</tr>\<tr>\<td>serviceaccount\</td>\<td>sum(serviceaccount_depth)\</td>\</tr>\<tr>\<td>endpoint\</td>\<td>sum(endpoint_depth)\</td>\</tr>\<tr>\<td>daemonset\</td>\<td>sum(daemonset_depth)\</td>\</tr>\<tr>\<td>statefulset\</td>\<td>sum(statefulset_depth)\</td>\</tr>\<tr>\<td>replicationmanager\</td>\<td>sum(replicationmanager_depth)\</td>\</tr>\</table>

Scheduler E2E Scheduling Latency#

| Catalog | Expression |
| --- | --- |
| Detail | `histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) by (le, instance)) / 1e+06` |
| Summary | `sum(histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) by (le, instance)) / 1e+06)` |

Scheduler Preemption Attempts#

| Catalog | Expression |
| --- | --- |
| Detail | `sum(rate(scheduler_total_preemption_attempts[5m])) by (instance)` |
| Summary | `sum(rate(scheduler_total_preemption_attempts[5m]))` |

Ingress Controller Connections#

CatalogExpression
Detail\<table>\<tr>\<td>reading\</td>\<td>sum(nginx_ingress_controller_nginx_process_connections{state="reading"}) by (instance)\</td>\</tr>\<tr>\<td>waiting\</td>\<td>sum(nginx_ingress_controller_nginx_process_connections{state="waiting"}) by (instance)\</td>\</tr>\<tr>\<td>writing\</td>\<td>sum(nginx_ingress_controller_nginx_process_connections{state="writing"}) by (instance)\</td>\</tr>\<tr>\<td>accepted\</td>\<td>sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="accepted"}[5m]))) by (instance)\</td>\</tr>\<tr>\<td>active\</td>\<td>sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="active"}[5m]))) by (instance)\</td>\</tr>\<tr>\<td>handled\</td>\<td>sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="handled"}[5m]))) by (instance)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>reading\</td>\<td>sum(nginx_ingress_controller_nginx_process_connections{state="reading"})\</td>\</tr>\<tr>\<td>waiting\</td>\<td>sum(nginx_ingress_controller_nginx_process_connections{state="waiting"})\</td>\</tr>\<tr>\<td>writing\</td>\<td>sum(nginx_ingress_controller_nginx_process_connections{state="writing"})\</td>\</tr>\<tr>\<td>accepted\</td>\<td>sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="accepted"}[5m])))\</td>\</tr>\<tr>\<td>active\</td>\<td>sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="active"}[5m])))\</td>\</tr>\<tr>\<td>handled\</td>\<td>sum(ceil(increase(nginx_ingress_controller_nginx_process_connections_total{state="handled"}[5m])))\</td>\</tr>\</table>

Ingress Controller Request Process Time#

| Catalog | Expression |
| --- | --- |
| Detail | `topk(10, histogram_quantile(0.95,sum by (le, host, path)(rate(nginx_ingress_controller_request_duration_seconds_bucket{host!="_"}[5m]))))` |
| Summary | `topk(10, histogram_quantile(0.95,sum by (le, host)(rate(nginx_ingress_controller_request_duration_seconds_bucket{host!="_"}[5m]))))` |

Rancher Logging Metrics

Fluentd Buffer Queue Rate#

| Catalog | Expression |
| --- | --- |
| Detail | `sum(rate(fluentd_output_status_buffer_queue_length[5m])) by (instance)` |
| Summary | `sum(rate(fluentd_output_status_buffer_queue_length[5m]))` |

Fluentd Input Rate#

| Catalog | Expression |
| --- | --- |
| Detail | `sum(rate(fluentd_input_status_num_records_total[5m])) by (instance)` |
| Summary | `sum(rate(fluentd_input_status_num_records_total[5m]))` |

Fluentd Output Errors Rate#

| Catalog | Expression |
| --- | --- |
| Detail | `sum(rate(fluentd_output_status_num_errors[5m])) by (type)` |
| Summary | `sum(rate(fluentd_output_status_num_errors[5m]))` |

Fluentd Output Rate#

| Catalog | Expression |
| --- | --- |
| Detail | `sum(rate(fluentd_output_status_num_records_total[5m])) by (instance)` |
| Summary | `sum(rate(fluentd_output_status_num_records_total[5m]))` |

Workload Metrics

Workload CPU Utilization#

CatalogExpression
Detail\<table>\<tr>\<td>cfs throttled seconds\</td>\<td>sum(rate(container_cpu_cfs_throttled_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\<tr>\<td>user seconds\</td>\<td>sum(rate(container_cpu_user_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\<tr>\<td>system seconds\</td>\<td>sum(rate(container_cpu_system_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\<tr>\<td>usage seconds\</td>\<td>sum(rate(container_cpu_usage_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>cfs throttled seconds\</td>\<td>sum(rate(container_cpu_cfs_throttled_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>user seconds\</td>\<td>sum(rate(container_cpu_user_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>system seconds\</td>\<td>sum(rate(container_cpu_system_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>usage seconds\</td>\<td>sum(rate(container_cpu_usage_seconds_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\</table>

Workload Memory Utilization#

| Catalog | Expression |
| --- | --- |
| Detail | `sum(container_memory_working_set_bytes{namespace="$namespace",pod_name=~"$podName", container_name!=""}) by (pod_name)` |
| Summary | `sum(container_memory_working_set_bytes{namespace="$namespace",pod_name=~"$podName", container_name!=""})` |

Workload Network Packets#

CatalogExpression
Detail\<table>\<tr>\<td>receive-packets\</td>\<td>sum(rate(container_network_receive_packets_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\<tr>\<td>receive-dropped\</td>\<td>sum(rate(container_network_receive_packets_dropped_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\<tr>\<td>receive-errors\</td>\<td>sum(rate(container_network_receive_errors_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\<tr>\<td>transmit-packets\</td>\<td>sum(rate(container_network_transmit_packets_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\<tr>\<td>transmit-dropped\</td>\<td>sum(rate(container_network_transmit_packets_dropped_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\<tr>\<td>transmit-errors\</td>\<td>sum(rate(container_network_transmit_errors_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>receive-packets\</td>\<td>sum(rate(container_network_receive_packets_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>receive-dropped\</td>\<td>sum(rate(container_network_receive_packets_dropped_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>receive-errors\</td>\<td>sum(rate(container_network_receive_errors_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit-packets\</td>\<td>sum(rate(container_network_transmit_packets_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit-dropped\</td>\<td>sum(rate(container_network_transmit_packets_dropped_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit-errors\</td>\<td>sum(rate(container_network_transmit_errors_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\</table>

Workload Network I/O#

CatalogExpression
Detail\<table>\<tr>\<td>receive\</td>\<td>sum(rate(container_network_receive_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\<tr>\<td>transmit\</td>\<td>sum(rate(container_network_transmit_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>receive\</td>\<td>sum(rate(container_network_receive_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit\</td>\<td>sum(rate(container_network_transmit_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\</table>

Workload Disk I/O#

CatalogExpression
Detail\<table>\<tr>\<td>read\</td>\<td>sum(rate(container_fs_reads_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\<tr>\<td>write\</td>\<td>sum(rate(container_fs_writes_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m])) by (pod_name)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>read\</td>\<td>sum(rate(container_fs_reads_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>write\</td>\<td>sum(rate(container_fs_writes_bytes_total{namespace="$namespace",pod_name=~"$podName",container_name!=""}[5m]))\</td>\</tr>\</table>

Pod Metrics

Pod CPU Utilization#

CatalogExpression
Detail\<table>\<tr>\<td>cfs throttled seconds\</td>\<td>sum(rate(container_cpu_cfs_throttled_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m])) by (container_name)\</td>\</tr>\<tr>\<td>usage seconds\</td>\<td>sum(rate(container_cpu_usage_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m])) by (container_name)\</td>\</tr>\<tr>\<td>system seconds\</td>\<td>sum(rate(container_cpu_system_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m])) by (container_name)\</td>\</tr>\<tr>\<td>user seconds\</td>\<td>sum(rate(container_cpu_user_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m])) by (container_name)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>cfs throttled seconds\</td>\<td>sum(rate(container_cpu_cfs_throttled_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>usage seconds\</td>\<td>sum(rate(container_cpu_usage_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>system seconds\</td>\<td>sum(rate(container_cpu_system_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>user seconds\</td>\<td>sum(rate(container_cpu_user_seconds_total{container_name!="POD",namespace="$namespace",pod_name="$podName", container_name!=""}[5m]))\</td>\</tr>\</table>

Pod Memory Utilization#

| Catalog | Expression |
| --- | --- |
| Detail | `sum(container_memory_working_set_bytes{container_name!="POD",namespace="$namespace",pod_name="$podName",container_name!=""}) by (container_name)` |
| Summary | `sum(container_memory_working_set_bytes{container_name!="POD",namespace="$namespace",pod_name="$podName",container_name!=""})` |

Pod Network Packets#

CatalogExpression
Detail\<table>\<tr>\<td>receive-packets\</td>\<td>sum(rate(container_network_receive_packets_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>receive-dropped\</td>\<td>sum(rate(container_network_receive_packets_dropped_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>receive-errors\</td>\<td>sum(rate(container_network_receive_errors_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit-packets\</td>\<td>sum(rate(container_network_transmit_packets_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit-dropped\</td>\<td>sum(rate(container_network_transmit_packets_dropped_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit-errors\</td>\<td>sum(rate(container_network_transmit_errors_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>receive-packets\</td>\<td>sum(rate(container_network_receive_packets_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>receive-dropped\</td>\<td>sum(rate(container_network_receive_packets_dropped_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>receive-errors\</td>\<td>sum(rate(container_network_receive_errors_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit-packets\</td>\<td>sum(rate(container_network_transmit_packets_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit-dropped\</td>\<td>sum(rate(container_network_transmit_packets_dropped_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit-errors\</td>\<td>sum(rate(container_network_transmit_errors_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\</table>

Pod Network I/O#

CatalogExpression
Detail\<table>\<tr>\<td>receive\</td>\<td>sum(rate(container_network_receive_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit\</td>\<td>sum(rate(container_network_transmit_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>receive\</td>\<td>sum(rate(container_network_receive_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>transmit\</td>\<td>sum(rate(container_network_transmit_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\</table>

Pod Disk I/O#

CatalogExpression
Detail\<table>\<tr>\<td>read\</td>\<td>sum(rate(container_fs_reads_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m])) by (container_name)\</td>\</tr>\<tr>\<td>write\</td>\<td>sum(rate(container_fs_writes_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m])) by (container_name)\</td>\</tr>\</table>
Summary\<table>\<tr>\<td>read\</td>\<td>sum(rate(container_fs_reads_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\<tr>\<td>write\</td>\<td>sum(rate(container_fs_writes_bytes_total{namespace="$namespace",pod_name="$podName",container_name!=""}[5m]))\</td>\</tr>\</table>

Container Metrics

Container CPU Utilization#

| Catalog | Expression |
| --- | --- |
| cfs throttled seconds | `sum(rate(container_cpu_cfs_throttled_seconds_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))` |
| usage seconds | `sum(rate(container_cpu_usage_seconds_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))` |
| system seconds | `sum(rate(container_cpu_system_seconds_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))` |
| user seconds | `sum(rate(container_cpu_user_seconds_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))` |

Container Memory Utilization#

sum(container_memory_working_set_bytes{namespace="$namespace",pod_name="$podName",container_name="$containerName"})

Container Disk I/O#

| Catalog | Expression |
| --- | --- |
| read | `sum(rate(container_fs_reads_bytes_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))` |
| write | `sum(rate(container_fs_writes_bytes_total{namespace="$namespace",pod_name="$podName",container_name="$containerName"}[5m]))` |