Compare commits

...

12 Commits

Author SHA1 Message Date
cc858dd8f3 Merge pull request 'chore(deps): update helm release argo-cd to v9.1.5' (#142) from renovate/argo-cd-9.x into main
Reviewed-on: #142
2025-11-30 20:28:29 +00:00
5d71a0f199 chore(deps): update helm release argo-cd to v9.1.5 2025-11-30 20:00:21 +00:00
a4d2f870d9 rm guestbook (the argo testing/example app) 2025-11-29 19:06:39 -05:00
7136a0f322 velero: add alert rules 2025-11-29 18:45:28 -05:00
c2d6c0c8bb velero: enable metrics and prometheusrule 2025-11-29 18:38:43 -05:00
f3c3741409 Merge pull request 'chore(deps): update helm release kube-prometheus-stack to v79.9.0' (#141) from renovate/kube-prometheus-stack-79.x into main
Reviewed-on: #141
2025-11-29 00:36:23 +00:00
80b7cb2282 chore(deps): update helm release kube-prometheus-stack to v79.9.0 2025-11-28 17:00:22 +00:00
bf66dd0818 Merge pull request 'chore(deps): update ghcr.io/siderolabs/discovery-service docker tag to v1.0.12' (#140) from renovate/ghcr.io-siderolabs-discovery-service-1.x into main
Reviewed-on: #140
2025-11-28 16:08:02 +00:00
eea1c80a27 rook-ceph: rm CephNodeDiskspaceWarning due to improper, non-ceph related alerts 2025-11-28 10:40:41 -05:00
612dd16d4b chore(deps): update ghcr.io/siderolabs/discovery-service docker tag to v1.0.12 2025-11-28 15:00:19 +00:00
341b402f0e Merge pull request 'chore(deps): update helm release kube-prometheus-stack to v79.8.2' (#139) from renovate/kube-prometheus-stack-79.x into main
Reviewed-on: #139
2025-11-28 01:08:18 +00:00
76eaa1dd98 chore(deps): update helm release kube-prometheus-stack to v79.8.2 2025-11-26 02:00:19 +00:00
8 changed files with 45 additions and 50 deletions

View File

@@ -2,7 +2,7 @@ version: "3.8"
services:
discovery:
restart: unless-stopped
image: ghcr.io/siderolabs/discovery-service:v1.0.11
image: ghcr.io/siderolabs/discovery-service:v1.0.12
ports:
- 10.105.6.215:3000:3000
- 10.105.6.215:3001:3001

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: argo-cd
version: 9.1.4
version: 9.1.5
repository: https://argoproj.github.io/argo-helm

View File

@@ -1,21 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: guestbook-ui
namespace: guestbook-ui
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: guestbook-ui
template:
metadata:
labels:
app: guestbook-ui
spec:
containers:
- image: gcr.io/heptio-images/ks-guestbook-demo:0.2
name: guestbook-ui
ports:
- containerPort: 80

View File

@@ -1,4 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
name: guestbook-ui

View File

@@ -1,11 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: guestbook-ui
namespace: guestbook-ui
spec:
ports:
- port: 80
targetPort: 80
selector:
app: guestbook-ui

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: kube-prometheus-stack
version: 79.8.1
version: 79.9.0
repository: https://prometheus-community.github.io/helm-charts

View File

@@ -544,15 +544,6 @@ spec:
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephNodeDiskspaceWarning"
annotations:
description: "Mountpoint {{ "{{" }} $labels.mountpoint {{ "}}" }} on {{ "{{" }} $labels.nodename {{ "}}" }} will be full in less than 5 days based on the 48 hour trailing fill rate."
summary: "Host filesystem free space is getting low"
expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.4"
severity: "warning"
type: "ceph_default"
- alert: "CephNodeInconsistentMTU"
annotations:
description: "Node {{ "{{" }} $labels.instance {{ "}}" }} has a different MTU size ({{ "{{" }} $value {{ "}}" }}) than the median of devices named {{ "{{" }} $labels.device {{ "}}" }}."

View File

@@ -2,8 +2,48 @@ velero:
backupsEnabled: true
snapshotsEnabled: false
metrics:
labels:
metrics_enabled: "true"
serviceMonitor:
enabled: true
prometheusRule:
enabled: true
spec:
- alert: VeleroBackupFailed
annotations:
message: Velero backup {{ $labels.schedule }} has failed
expr: |-
velero_backup_last_status{schedule!=""} != 1
for: 15m
labels:
severity: warning
- alert: VeleroBackupFailing
annotations:
message: Velero backup {{ $labels.schedule }} has been failing for the last 12h
expr: |-
velero_backup_last_status{schedule!=""} != 1
for: 12h
labels:
severity: critical
- alert: VeleroNoNewBackup
annotations:
message: Velero backup {{ $labels.schedule }} has not run successfully in the last 25h
expr: |-
(
(time() - velero_backup_last_successful_timestamp{schedule!=""}) >bool (25 * 3600)
or
absent(velero_backup_last_successful_timestamp{schedule!=""})
) == 1
for: 1h
labels:
severity: critical
- alert: VeleroBackupPartialFailures
annotations:
message: Velero backup {{ $labels.schedule }} has {{ $value | humanizePercentage }} partially failed backups
expr: |-
rate(velero_backup_partial_failure_total{schedule!=""}[25m])
/ rate(velero_backup_attempt_total{schedule!=""}[25m]) > 0.5
for: 15m
labels:
severity: warning
configuration:
backupStorageLocation:
- name: weyma-truenas