remove redundant node alerts

This commit is contained in:
2025-12-30 23:50:18 -05:00
parent c26ea4e139
commit 6f2603d3a0

View File

@@ -497,61 +497,6 @@ spec:
oid: "1.3.6.1.4.1.50495.1.2.1.8.1"
severity: "critical"
type: "ceph_default"
- alert: "CephNodeNetworkPacketDrops"
annotations:
description: "Node {{ "{{" }} $labels.instance {{ "}}" }} experiences packet drop > 0.5% or > 10 packets/s on interface {{ "{{" }} $labels.device {{ "}}" }}."
summary: "One or more NICs reports packet drops"
expr: |
(
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0050000000000000001 and (
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) >= 10
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.2"
severity: "warning"
type: "ceph_default"
- alert: "CephNodeNetworkPacketErrors"
annotations:
description: "Node {{ "{{" }} $labels.instance {{ "}}" }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ "{{" }} $labels.device {{ "}}" }}."
summary: "One or more NICs reports packet errors"
expr: |
(
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) >= 10
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
severity: "warning"
type: "ceph_default"
- alert: "CephNodeNetworkBondDegraded"
annotations:
description: "Bond {{ "{{" }} $labels.master {{ "}}" }} is degraded on Node {{ "{{" }} $labels.instance {{ "}}" }}."
summary: "Degraded Bond on Node {{ "{{" }} $labels.instance {{ "}}" }}"
expr: |
node_bonding_slaves - node_bonding_active != 0
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephNodeInconsistentMTU"
annotations:
description: "Node {{ "{{" }} $labels.instance {{ "}}" }} has a different MTU size ({{ "{{" }} $value {{ "}}" }}) than the median of devices named {{ "{{" }} $labels.device {{ "}}" }}."
summary: "MTU settings across Ceph hosts are inconsistent"
expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )"
labels:
severity: "warning"
type: "ceph_default"
- name: "pools"
rules:
- alert: "CephPoolGrowthWarning"