Compare commits

...

20 Commits

Author SHA1 Message Date
cc858dd8f3 Merge pull request 'chore(deps): update helm release argo-cd to v9.1.5' (#142) from renovate/argo-cd-9.x into main
Reviewed-on: #142
2025-11-30 20:28:29 +00:00
5d71a0f199 chore(deps): update helm release argo-cd to v9.1.5 2025-11-30 20:00:21 +00:00
a4d2f870d9 rm guestbook (the argo testing/example app) 2025-11-29 19:06:39 -05:00
7136a0f322 velero: add alert rules 2025-11-29 18:45:28 -05:00
c2d6c0c8bb velero: enable metrics and prometheusrule 2025-11-29 18:38:43 -05:00
f3c3741409 Merge pull request 'chore(deps): update helm release kube-prometheus-stack to v79.9.0' (#141) from renovate/kube-prometheus-stack-79.x into main
Reviewed-on: #141
2025-11-29 00:36:23 +00:00
80b7cb2282 chore(deps): update helm release kube-prometheus-stack to v79.9.0 2025-11-28 17:00:22 +00:00
bf66dd0818 Merge pull request 'chore(deps): update ghcr.io/siderolabs/discovery-service docker tag to v1.0.12' (#140) from renovate/ghcr.io-siderolabs-discovery-service-1.x into main
Reviewed-on: #140
2025-11-28 16:08:02 +00:00
eea1c80a27 rook-ceph: rm CephNodeDiskspaceWarning due to improper, non-ceph related alerts 2025-11-28 10:40:41 -05:00
612dd16d4b chore(deps): update ghcr.io/siderolabs/discovery-service docker tag to v1.0.12 2025-11-28 15:00:19 +00:00
341b402f0e Merge pull request 'chore(deps): update helm release kube-prometheus-stack to v79.8.2' (#139) from renovate/kube-prometheus-stack-79.x into main
Reviewed-on: #139
2025-11-28 01:08:18 +00:00
76eaa1dd98 chore(deps): update helm release kube-prometheus-stack to v79.8.2 2025-11-26 02:00:19 +00:00
a730f43cbd Merge pull request 'chore(deps): update helm release kube-prometheus-stack to v79.8.1' (#138) from renovate/kube-prometheus-stack-79.x into main
Reviewed-on: #138
2025-11-26 00:18:05 +00:00
4bd23be552 chore(deps): update helm release kube-prometheus-stack to v79.8.1 2025-11-25 21:00:19 +00:00
6cd4b20970 metallb: no rbacPrometheus 2025-11-24 22:45:00 -05:00
c3c66cb9e3 metallb: fix values 2025-11-24 22:43:52 -05:00
b0fb79f7ea traefik: fix prometheusrule 2025-11-24 22:39:00 -05:00
624c5c7a8c traefik: enable monitoring 2025-11-24 22:33:08 -05:00
ebf8f25342 metallb: enable prometheusrules and servicemonitors 2025-11-24 22:31:16 -05:00
87c5d94e0d external-secrets: enable monitoring 2025-11-24 22:29:47 -05:00
11 changed files with 66 additions and 406 deletions

View File

@@ -2,7 +2,7 @@ version: "3.8"
services: services:
discovery: discovery:
restart: unless-stopped restart: unless-stopped
image: ghcr.io/siderolabs/discovery-service:v1.0.11 image: ghcr.io/siderolabs/discovery-service:v1.0.12
ports: ports:
- 10.105.6.215:3000:3000 - 10.105.6.215:3000:3000
- 10.105.6.215:3001:3001 - 10.105.6.215:3001:3001

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies: dependencies:
- name: argo-cd - name: argo-cd
version: 9.1.4 version: 9.1.5
repository: https://argoproj.github.io/argo-helm repository: https://argoproj.github.io/argo-helm

View File

@@ -171,7 +171,7 @@ resources: {}
serviceMonitor: serviceMonitor:
# -- Specifies whether to create a ServiceMonitor resource for collecting Prometheus metrics # -- Specifies whether to create a ServiceMonitor resource for collecting Prometheus metrics
enabled: false enabled: true
# -- namespace where you want to install ServiceMonitors # -- namespace where you want to install ServiceMonitors
namespace: "" namespace: ""

View File

@@ -1,21 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: guestbook-ui
namespace: guestbook-ui
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: guestbook-ui
template:
metadata:
labels:
app: guestbook-ui
spec:
containers:
- image: gcr.io/heptio-images/ks-guestbook-demo:0.2
name: guestbook-ui
ports:
- containerPort: 80

View File

@@ -1,4 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
name: guestbook-ui

View File

@@ -1,11 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: guestbook-ui
namespace: guestbook-ui
spec:
ports:
- port: 80
targetPort: 80
selector:
app: guestbook-ui

View File

@@ -1,354 +1,7 @@
# Default values for metallb. metallb:
# This is a YAML-formatted file. prometheus:
# Declare variables to be passed into your templates. rbacPrometheus: false
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
loadBalancerClass: ""
# To configure MetalLB, you must specify ONE of the following two
# options.
rbac:
# create specifies whether to install and use RBAC rules.
create: true
prometheus:
# scrape annotations specifies whether to add Prometheus metric
# auto-collection annotations to pods. See
# https://github.com/prometheus/prometheus/blob/release-2.1/documentation/examples/prometheus-kubernetes.yml
# for a corresponding Prometheus configuration. Alternatively, you
# may want to use the Prometheus Operator
# (https://github.com/coreos/prometheus-operator) for more powerful
# monitoring configuration. If you use the Prometheus operator, this
# can be left at false.
scrapeAnnotations: false
# port both controller and speaker will listen on for metrics
metricsPort: 7472
# if set, enables rbac proxy on the controller and speaker to expose
# the metrics via tls.
# secureMetricsPort: 9120
# the name of the secret to be mounted in the speaker pod
# to expose the metrics securely. If not present, a self-signed
# certificate is used.
speakerMetricsTLSSecret: ""
# the name of the secret to be mounted in the controller pod
# to expose the metrics securely. If not present, a self-signed
# certificate is used.
controllerMetricsTLSSecret: ""
# prometheus doesn't have the permission to scrape all namespaces so we give it permission to scrape metallb's one
rbacPrometheus: true
# the service account used by prometheus
# required when " .Values.prometheus.rbacPrometheus == true " and " .Values.prometheus.podMonitor.enabled=true or prometheus.serviceMonitor.enabled=true "
serviceAccount: ""
# the namespace where prometheus is deployed
# required when " .Values.prometheus.rbacPrometheus == true " and " .Values.prometheus.podMonitor.enabled=true or prometheus.serviceMonitor.enabled=true "
namespace: ""
# the image to be used for the kube-rbac-proxy container
rbacProxy:
repository: gcr.io/kubebuilder/kube-rbac-proxy
tag: v0.12.0
pullPolicy:
# Prometheus Operator PodMonitors
podMonitor: podMonitor:
# enable support for Prometheus Operator enabled: true
enabled: false
# optional additional labels for podMonitors
additionalLabels: {}
# optional annotations for podMonitors
annotations: {}
# Job label for scrape target
jobLabel: "app.kubernetes.io/name"
# Scrape interval. If not set, the Prometheus default scrape interval is used.
interval:
# metric relabel configs to apply to samples before ingestion.
metricRelabelings: []
# - action: keep
# regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
# sourceLabels: [__name__]
# relabel configs to apply to samples before ingestion.
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# target_label: nodename
# replacement: $1
# action: replace
# Prometheus Operator ServiceMonitors. To be used as an alternative
# to podMonitor, supports secure metrics.
serviceMonitor:
# enable support for Prometheus Operator
enabled: false
speaker:
# optional additional labels for the speaker serviceMonitor
additionalLabels: {}
# optional additional annotations for the speaker serviceMonitor
annotations: {}
# optional tls configuration for the speaker serviceMonitor, in case
# secure metrics are enabled.
tlsConfig:
insecureSkipVerify: true
controller:
# optional additional labels for the controller serviceMonitor
additionalLabels: {}
# optional additional annotations for the controller serviceMonitor
annotations: {}
# optional tls configuration for the controller serviceMonitor, in case
# secure metrics are enabled.
tlsConfig:
insecureSkipVerify: true
# Job label for scrape target
jobLabel: "app.kubernetes.io/name"
# Scrape interval. If not set, the Prometheus default scrape interval is used.
interval:
# metric relabel configs to apply to samples before ingestion.
metricRelabelings: []
# - action: keep
# regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
# sourceLabels: [__name__]
# relabel configs to apply to samples before ingestion.
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# target_label: nodename
# replacement: $1
# action: replace
# Prometheus Operator alertmanager alerts
prometheusRule: prometheusRule:
# enable alertmanager alerts
enabled: false
# optional additional labels for prometheusRules
additionalLabels: {}
# optional annotations for prometheusRules
annotations: {}
# MetalLBStaleConfig
staleConfig:
enabled: true enabled: true
labels:
severity: warning
# MetalLBConfigNotLoaded
configNotLoaded:
enabled: true
labels:
severity: warning
# MetalLBAddressPoolExhausted
addressPoolExhausted:
enabled: true
labels:
severity: critical
addressPoolUsage:
enabled: true
thresholds:
- percent: 75
labels:
severity: warning
- percent: 85
labels:
severity: warning
- percent: 95
labels:
severity: critical
# MetalLBBGPSessionDown
bgpSessionDown:
enabled: true
labels:
severity: critical
extraAlerts: []
# controller contains configuration specific to the MetalLB cluster
# controller.
controller:
enabled: true
# -- Controller log level. Must be one of: `all`, `debug`, `info`, `warn`, `error` or `none`
logLevel: info
# command: /controller
# webhookMode: enabled
## @param controller.updateStrategy.type Metallb controller deployment strategy type.
## ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy
## e.g:
## strategy:
## type: RollingUpdate
## rollingUpdate:
## maxSurge: 25%
## maxUnavailable: 25%
##
strategy:
type: RollingUpdate
serviceAccount:
# Specifies whether a ServiceAccount should be created
create: true
# The name of the ServiceAccount to use. If not set and create is
# true, a name is generated using the fullname template
name: ""
annotations: {}
securityContext:
runAsNonRoot: true
# nobody
runAsUser: 65534
fsGroup: 65534
resources: {}
# limits:
# cpu: 100m
# memory: 100Mi
nodeSelector: {}
tolerations: []
priorityClassName: ""
runtimeClassName: ""
affinity: {}
podAnnotations: {}
labels: {}
livenessProbe:
enabled: true
failureThreshold: 3
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
enabled: true
failureThreshold: 3
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
tlsMinVersion: "VersionTLS12"
tlsCipherSuites: ""
extraContainers: []
# speaker contains configuration specific to the MetalLB speaker
# daemonset.
speaker:
enabled: true
# command: /speaker
# -- Speaker log level. Must be one of: `all`, `debug`, `info`, `warn`, `error` or `none`
logLevel: info
tolerateMaster: true
memberlist:
enabled: true
mlBindPort: 7946
mlBindAddrOverride: ""
mlSecretKeyPath: "/etc/ml_secret_key"
excludeInterfaces:
enabled: true
# ignore the exclude-from-external-loadbalancer label
ignoreExcludeLB: false
## @param speaker.updateStrategy.type Speaker daemonset strategy type
## ref: https://kubernetes.io/docs/tasks/manage-daemon/update-daemon-set/
##
updateStrategy:
## StrategyType
## Can be set to RollingUpdate or OnDelete
##
type: RollingUpdate
serviceAccount:
# Specifies whether a ServiceAccount should be created
create: true
# The name of the ServiceAccount to use. If not set and create is
# true, a name is generated using the fullname template
name: ""
annotations: {}
securityContext: {}
## Defines a secret name for the controller to generate a memberlist encryption secret
## By default secretName: {{ "metallb.fullname" }}-memberlist
##
# secretName:
resources: {}
# limits:
# cpu: 100m
# memory: 100Mi
nodeSelector: {}
tolerations: []
priorityClassName: ""
affinity: {}
## Selects which runtime class will be used by the pod.
runtimeClassName: ""
podAnnotations: {}
labels: {}
livenessProbe:
enabled: true
failureThreshold: 3
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
enabled: true
failureThreshold: 3
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
startupProbe:
enabled: true
failureThreshold: 30
periodSeconds: 5
# frr contains configuration specific to the MetalLB FRR container,
# for speaker running alongside FRR.
frr:
enabled: false
metricsPort: 7473
resources: {}
# if set, enables a rbac proxy sidecar container on the speaker to
# expose the frr metrics via tls.
# secureMetricsPort: 9121
reloader:
resources: {}
frrMetrics:
resources: {}
extraContainers: []
crds:
enabled: true
validationFailurePolicy: Fail
# frrk8s contains the configuration related to using an frrk8s instance
# (github.com/metallb/frr-k8s) as the backend for the BGP implementation.
# This allows configuring additional frr parameters in combination to those
# applied by MetalLB.
frrk8s:
# if set, enables frrk8s as a backend. This is mutually exclusive to frr
# mode.
enabled: false
external: false
namespace: ""

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies: dependencies:
- name: kube-prometheus-stack - name: kube-prometheus-stack
version: 79.7.1 version: 79.9.0
repository: https://prometheus-community.github.io/helm-charts repository: https://prometheus-community.github.io/helm-charts

View File

@@ -544,15 +544,6 @@ spec:
labels: labels:
severity: "warning" severity: "warning"
type: "ceph_default" type: "ceph_default"
- alert: "CephNodeDiskspaceWarning"
annotations:
description: "Mountpoint {{ "{{" }} $labels.mountpoint {{ "}}" }} on {{ "{{" }} $labels.nodename {{ "}}" }} will be full in less than 5 days based on the 48 hour trailing fill rate."
summary: "Host filesystem free space is getting low"
expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.4"
severity: "warning"
type: "ceph_default"
- alert: "CephNodeInconsistentMTU" - alert: "CephNodeInconsistentMTU"
annotations: annotations:
description: "Node {{ "{{" }} $labels.instance {{ "}}" }} has a different MTU size ({{ "{{" }} $value {{ "}}" }}) than the median of devices named {{ "{{" }} $labels.device {{ "}}" }}." description: "Node {{ "{{" }} $labels.instance {{ "}}" }} has a different MTU size ({{ "{{" }} $value {{ "}}" }}) than the median of devices named {{ "{{" }} $labels.device {{ "}}" }}."

View File

@@ -20,8 +20,20 @@ traefik:
prometheus: prometheus:
service: service:
enabled: true enabled: true
serviceMonitor:
enabled: true
prometheusRule:
enabled: true
rules:
- alert: TraefikDown
expr: up{job="traefik"} == 0
for: 5m
labels: labels:
metrics_enabled: "true" context: traefik
severity: warning
annotations:
summary: "Traefik Down"
description: "{{ $labels.pod }} on {{ $labels.nodename }} is down"
deployment: deployment:
kind: DaemonSet kind: DaemonSet
additionalContainers: additionalContainers:

View File

@@ -2,8 +2,48 @@ velero:
backupsEnabled: true backupsEnabled: true
snapshotsEnabled: false snapshotsEnabled: false
metrics: metrics:
serviceMonitor:
enabled: true
prometheusRule:
enabled: true
spec:
- alert: VeleroBackupFailed
annotations:
message: Velero backup {{ $labels.schedule }} has failed
expr: |-
velero_backup_last_status{schedule!=""} != 1
for: 15m
labels: labels:
metrics_enabled: "true" severity: warning
- alert: VeleroBackupFailing
annotations:
message: Velero backup {{ $labels.schedule }} has been failing for the last 12h
expr: |-
velero_backup_last_status{schedule!=""} != 1
for: 12h
labels:
severity: critical
- alert: VeleroNoNewBackup
annotations:
message: Velero backup {{ $labels.schedule }} has not run successfully in the last 25h
expr: |-
(
(time() - velero_backup_last_successful_timestamp{schedule!=""}) >bool (25 * 3600)
or
absent(velero_backup_last_successful_timestamp{schedule!=""})
) == 1
for: 1h
labels:
severity: critical
- alert: VeleroBackupPartialFailures
annotations:
message: Velero backup {{ $labels.schedule }} has {{ $value | humanizePercentage }} partially failed backups
expr: |-
rate(velero_backup_partial_failure_total{schedule!=""}[25m])
/ rate(velero_backup_attempt_total{schedule!=""}[25m]) > 0.5
for: 15m
labels:
severity: warning
configuration: configuration:
backupStorageLocation: backupStorageLocation:
- name: weyma-truenas - name: weyma-truenas