Compare commits
143 Commits
7237e23151
...
renovate/a
| Author | SHA1 | Date | |
|---|---|---|---|
|
e2cf802ae4
|
|||
|
99ec607e6d
|
|||
| 96424b124c | |||
|
96937cd358
|
|||
|
373823e565
|
|||
|
d36dd7735f
|
|||
|
1a0aeb0e64
|
|||
|
e6e63b5b2c
|
|||
|
0fcb071122
|
|||
|
e0f4fc71af
|
|||
|
e747bbe519
|
|||
| 067c3cbc59 | |||
| 27fcdd6bac | |||
|
67a7c32675
|
|||
|
c352c07f7b
|
|||
|
3397d80865
|
|||
|
39548b9b31
|
|||
|
9b75b8d4bf
|
|||
|
8d29dd8bd1
|
|||
| 4090830d95 | |||
|
21790a5a41
|
|||
|
4ab5ecdd6f
|
|||
|
c11f7897d7
|
|||
|
8839dd6eb1
|
|||
|
cc57178974
|
|||
| 3d95158244 | |||
|
141f05c6ae
|
|||
|
3651f23c72
|
|||
|
b4cbbd97a6
|
|||
| ede26d9c1d | |||
|
cc14ef66ed
|
|||
|
35b3f6cc42
|
|||
|
033a3b95ad
|
|||
|
f90060e366
|
|||
| 1f074a7087 | |||
|
c6cf3b7d84
|
|||
|
e611c68342
|
|||
|
d828d88078
|
|||
|
8fa00efc16
|
|||
|
fbe2274182
|
|||
|
bf4985040a
|
|||
|
ce3a367ec9
|
|||
|
6006e75db9
|
|||
| db590d1d2c | |||
|
7b6f92646f
|
|||
|
6bbd9748a2
|
|||
|
b22ff17c1d
|
|||
|
96900bea0c
|
|||
|
0f84c335de
|
|||
|
5e1b5dc007
|
|||
| f0f1b45c93 | |||
| 46c4e7b50f | |||
|
dbba05d7b6
|
|||
|
db9aa7c99d
|
|||
|
8f0d73946f
|
|||
|
d0ac6145e0
|
|||
|
b9830a2153
|
|||
|
4f51cc5799
|
|||
|
fe1707d078
|
|||
| df154d3b8b | |||
| 49d6684d0a | |||
|
179cdaffd7
|
|||
|
396c998336
|
|||
|
1829d76a07
|
|||
|
4315074427
|
|||
|
289a51fd7d
|
|||
|
b6f178ef88
|
|||
| eb021c1510 | |||
|
99e7e0ae30
|
|||
|
e80fb62fd7
|
|||
|
b6cf261505
|
|||
| cc2b1825d5 | |||
|
5b15d78da0
|
|||
|
3e54d7c96d
|
|||
|
52d680a143
|
|||
|
a5a604a496
|
|||
|
9194de2325
|
|||
| ce5a5c63e3 | |||
|
6a4c3e2253
|
|||
|
65013f6720
|
|||
|
9426dbeb71
|
|||
|
ca3234cb79
|
|||
| 6d2d895b67 | |||
| c2bd9b23ac | |||
|
0790ccd2ad
|
|||
|
ddcadddaaa
|
|||
|
8fabc526ad
|
|||
|
e285b581f3
|
|||
|
3f614405c8
|
|||
|
079fdd4da2
|
|||
| 6e22223c4b | |||
| 61d5ad7071 | |||
| cd3f663549 | |||
|
f1fd7c6cb1
|
|||
|
175b2c13f9
|
|||
|
a643de1085
|
|||
| e5aab6948d | |||
|
30456b3817
|
|||
|
ab12531084
|
|||
|
b789b7be21
|
|||
|
3a2cfdb84e
|
|||
|
506c034948
|
|||
|
7cbc80906e
|
|||
|
3fd705520c
|
|||
|
94d65decd1
|
|||
|
e06a1be194
|
|||
|
dc926c31de
|
|||
|
af31507e8c
|
|||
|
c0ca549393
|
|||
|
a113c84c9d
|
|||
|
a7cc46ed8a
|
|||
|
54e6a76aab
|
|||
|
33ef2866e9
|
|||
|
b609e87dd3
|
|||
|
e1ffafc161
|
|||
| 4170dfa26c | |||
|
5fcb92ee8b
|
|||
|
c5acc2416f
|
|||
|
87b667b2ab
|
|||
| d68d2db3bc | |||
| ad68a17eb5 | |||
|
b07c7bf3a0
|
|||
|
78fc45ae6c
|
|||
| 2fa1594e99 | |||
| b211327516 | |||
|
6885ec790c
|
|||
|
664cace62e
|
|||
|
dae06b2c05
|
|||
|
583831273d
|
|||
|
f327b23001
|
|||
|
6f2603d3a0
|
|||
| c26ea4e139 | |||
| b521924f00 | |||
|
19f203e374
|
|||
|
bb251462fb
|
|||
|
9a9d108e7c
|
|||
|
70d5ae2e48
|
|||
| e6e25baee1 | |||
| a08e9930d5 | |||
|
94bb98b4ed
|
|||
|
07f863b0a7
|
|||
| 79669aaf16 | |||
|
72e16276b8
|
37
README.md
Normal file
37
README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# Main Infrastructure: weyma-talos
|
||||
|
||||
**Production Kubernetes infrastructure with disaster recovery capabilities**
|
||||
|
||||
This repository contains the foundational infrastructure for my Kubernetes homelab, designed with reliability and rapid recovery as core principles.
|
||||
|
||||
## Architecture
|
||||
|
||||
My infrastructure follows a layered "black start" approach: essential services run outside the Kubernetes cluster so that the cluster can be bootstrapped and recovered even after a total failure.
|
||||
|
||||
### Black Start Layer
|
||||
Static services (Docker Compose on TrueNAS/Proxmox) that provide cluster dependencies:
|
||||
- Image cache for faster deployments and offline capability
|
||||
- Talos discovery server for node bootstrapping
|
||||
- HashiCorp Vault for secrets management (external to cluster)
|
||||
- Future: Self-hosted Sidero Omni server (migrating from SaaS)
|
||||
|
||||
### System Apps Layer
|
||||
Applications running within Kubernetes that provide core cluster functionality, managed via ArgoCD with GitOps principles.
|
||||
|
||||
## Repository Structure
|
||||
|
||||
- **`black-start/`** - Docker Compose services for cluster dependencies
|
||||
- **`config-patches/`** - Talos Linux configuration patches for cluster and individual machines
|
||||
- **`omni/`** - Sidero Omni [cluster template](https://docs.siderolabs.com/omni/reference/cluster-templates)
|
||||
- **`system-apps/`** - System applications (ArgoCD projects): monitoring, ingress, certificates, and storage
|
||||
|
||||
## Tech Stack
|
||||
|
||||
**OS:** Talos Linux | **Orchestration:** Kubernetes | **GitOps:** ArgoCD | **Secrets:** Vault | **Storage:** Rook-Ceph
|
||||
|
||||
## Recovery Process
|
||||
|
||||
The "black start" architecture enables ~15-20 minute automated recovery from complete infrastructure failure:
|
||||
**1.** Start black-start services → **2.** Bootstrap Talos → **3.** Deploy system apps → **4.** Deploy core apps
|
||||
|
||||
For application deployments, see [core-apps](https://git.dubyatp.xyz/core-apps).
|
||||
@@ -2,7 +2,7 @@ version: "3.8"
|
||||
services:
|
||||
discovery:
|
||||
restart: unless-stopped
|
||||
image: ghcr.io/siderolabs/discovery-service:v1.0.13
|
||||
image: ghcr.io/siderolabs/discovery-service:v1.0.15
|
||||
ports:
|
||||
- 10.105.6.215:3000:3000
|
||||
- 10.105.6.215:3001:3001
|
||||
@@ -52,6 +52,7 @@ patches:
|
||||
bind-address: 0.0.0.0
|
||||
proxy:
|
||||
extraArgs:
|
||||
proxy-mode: ipvs
|
||||
metrics-bind-address: 0.0.0.0:10249
|
||||
scheduler:
|
||||
extraArgs:
|
||||
@@ -287,12 +288,51 @@ patches:
|
||||
selector:
|
||||
k8s-app: metrics-server
|
||||
name: metrics-lb
|
||||
- contents: |-
|
||||
apiVersion: v1
|
||||
data:
|
||||
Corefile: |
|
||||
.:53 {
|
||||
errors
|
||||
health {
|
||||
lameduck 5s
|
||||
}
|
||||
ready
|
||||
log . {
|
||||
class error
|
||||
}
|
||||
prometheus :9153
|
||||
|
||||
kubernetes cluster.local in-addr.arpa ip6.arpa {
|
||||
pods insecure
|
||||
fallthrough in-addr.arpa ip6.arpa
|
||||
ttl 30
|
||||
}
|
||||
|
||||
rewrite name git.dubyatp.xyz traefik-local.traefik.svc.cluster.local
|
||||
|
||||
forward . /etc/resolv.conf {
|
||||
max_concurrent 1000
|
||||
}
|
||||
cache 30 {
|
||||
disable success cluster.local
|
||||
disable denial cluster.local
|
||||
}
|
||||
loop
|
||||
reload
|
||||
loadbalance
|
||||
}
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: coredns
|
||||
namespace: kube-system
|
||||
name: coredns-config
|
||||
---
|
||||
kind: ControlPlane
|
||||
machines:
|
||||
- 20b4c826-e699-43b3-826d-73eb5173680b
|
||||
- 30303031-3030-3030-6335-303731636665
|
||||
- 5fdea709-56ad-45f2-966d-5e344dbe4fdf
|
||||
- 30303031-3030-3030-6335-303731636665
|
||||
---
|
||||
kind: Workers
|
||||
machines:
|
||||
@@ -306,6 +346,8 @@ kind: Machine
|
||||
name: 02c02200-f403-11ef-9372-70f446672600
|
||||
patches:
|
||||
- idOverride: 400-cm-02c02200-f403-11ef-9372-70f446672600
|
||||
annotations:
|
||||
name: ""
|
||||
inline:
|
||||
machine:
|
||||
network:
|
||||
@@ -329,14 +371,11 @@ patches:
|
||||
interface: br0
|
||||
---
|
||||
kind: Machine
|
||||
systemExtensions:
|
||||
- siderolabs/i915
|
||||
- siderolabs/nut-client
|
||||
name: 03000200-0400-0500-0006-000700080009
|
||||
install:
|
||||
disk: /dev/sda
|
||||
patches:
|
||||
- idOverride: 400-cm-03000200-0400-0500-0006-000700080009
|
||||
annotations:
|
||||
name: ""
|
||||
inline:
|
||||
machine:
|
||||
network:
|
||||
@@ -363,6 +402,8 @@ kind: Machine
|
||||
name: 1006b91a-ecbf-11ea-aed4-046ba1ee3700
|
||||
patches:
|
||||
- idOverride: 400-cm-1006b91a-ecbf-11ea-aed4-046ba1ee3700
|
||||
annotations:
|
||||
name: ""
|
||||
inline:
|
||||
machine:
|
||||
network:
|
||||
@@ -386,22 +427,6 @@ patches:
|
||||
interface: br0
|
||||
---
|
||||
kind: Machine
|
||||
name: 20b4c826-e699-43b3-826d-73eb5173680b
|
||||
patches:
|
||||
- idOverride: 400-cm-20b4c826-e699-43b3-826d-73eb5173680b
|
||||
inline:
|
||||
machine:
|
||||
network:
|
||||
hostname: weyma-talos-cp02
|
||||
interfaces:
|
||||
- deviceSelector:
|
||||
driver: virtio*
|
||||
hardwareAddr: 00:16:3e:9c:01:27
|
||||
dhcp: true
|
||||
---
|
||||
kind: Machine
|
||||
systemExtensions:
|
||||
- siderolabs/nut-client
|
||||
name: 30303031-3030-3030-6335-303731636665
|
||||
patches:
|
||||
- idOverride: 400-cm-30303031-3030-3030-6335-303731636665
|
||||
@@ -415,9 +440,27 @@ patches:
|
||||
dhcp: true
|
||||
---
|
||||
kind: Machine
|
||||
name: 20b4c826-e699-43b3-826d-73eb5173680b
|
||||
patches:
|
||||
- idOverride: 400-cm-20b4c826-e699-43b3-826d-73eb5173680b
|
||||
annotations:
|
||||
name: ""
|
||||
inline:
|
||||
machine:
|
||||
network:
|
||||
hostname: weyma-talos-cp02
|
||||
interfaces:
|
||||
- deviceSelector:
|
||||
driver: virtio*
|
||||
hardwareAddr: 00:16:3e:9c:01:27
|
||||
dhcp: true
|
||||
---
|
||||
kind: Machine
|
||||
name: 5f0cd701-0784-4fcc-8e52-3b3304049972
|
||||
patches:
|
||||
- idOverride: 400-cm-5f0cd701-0784-4fcc-8e52-3b3304049972
|
||||
annotations:
|
||||
name: ""
|
||||
inline:
|
||||
machine:
|
||||
network:
|
||||
@@ -446,6 +489,8 @@ systemExtensions:
|
||||
name: 5fdea709-56ad-45f2-966d-5e344dbe4fdf
|
||||
patches:
|
||||
- idOverride: 400-cm-5fdea709-56ad-45f2-966d-5e344dbe4fdf
|
||||
annotations:
|
||||
name: ""
|
||||
inline:
|
||||
machine:
|
||||
network:
|
||||
@@ -460,6 +505,8 @@ kind: Machine
|
||||
name: da507021-8912-4337-86a3-94a05bd1cf05
|
||||
patches:
|
||||
- idOverride: 400-cm-da507021-8912-4337-86a3-94a05bd1cf05
|
||||
annotations:
|
||||
name: ""
|
||||
inline:
|
||||
machine:
|
||||
network:
|
||||
|
||||
@@ -14,6 +14,11 @@
|
||||
}
|
||||
],
|
||||
"packageRules": [
|
||||
{
|
||||
"description": "Consolidate patch and minor updates to one PR",
|
||||
"matchUpdateTypes": ["minor", "patch"],
|
||||
"groupName": "all-minor-patch-updates"
|
||||
},
|
||||
{
|
||||
"description": "Rook Ceph - auto-update minor and patch versions only",
|
||||
"matchDatasources": ["docker"],
|
||||
|
||||
@@ -24,5 +24,5 @@ appVersion: "1.0"
|
||||
|
||||
dependencies:
|
||||
- name: argo-cd
|
||||
version: 9.2.0
|
||||
version: 9.4.7
|
||||
repository: https://argoproj.github.io/argo-helm
|
||||
@@ -56,18 +56,6 @@ argo-cd:
|
||||
Argo CD has not reported any applications data for the past 15 minutes which
|
||||
means that it must be down or not functioning properly. This needs to be
|
||||
resolved for this cloud to continue to maintain state.
|
||||
- alert: ArgoAppNotSynced
|
||||
expr: |
|
||||
argocd_app_info{sync_status!="Synced"} == 1
|
||||
for: 12h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: '{{ $labels.name }} Application not synchronized'
|
||||
description: >
|
||||
The application {{ $labels.name }} has not been synchronized for over
|
||||
12 hours which means that the state of this cloud has drifted away from the
|
||||
state inside Git.
|
||||
server:
|
||||
ingress:
|
||||
enabled: true
|
||||
@@ -140,18 +128,30 @@ argo-cd:
|
||||
remoteRef:
|
||||
key: argo-cd
|
||||
property: webhook.gitea.secret
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: admin.password
|
||||
remoteRef:
|
||||
key: argo-cd
|
||||
property: admin.password
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: admin.passwordMtime
|
||||
remoteRef:
|
||||
key: argo-cd
|
||||
property: admin.passwordMtime
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: dex.authentik.clientSecret
|
||||
remoteRef:
|
||||
key: argo-cd
|
||||
property: dex.authentik.clientSecret
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- apiVersion: external-secrets.io/v1
|
||||
kind: ExternalSecret
|
||||
metadata:
|
||||
@@ -172,14 +172,23 @@ argo-cd:
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: sshPrivateKey
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: type
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: type
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: url
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: url.core-apps
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- apiVersion: external-secrets.io/v1
|
||||
kind: ExternalSecret
|
||||
metadata:
|
||||
@@ -200,14 +209,23 @@ argo-cd:
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: sshPrivateKey
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: type
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: type
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: url
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: url.weyma-talos
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- apiVersion: external-secrets.io/v1
|
||||
kind: ExternalSecret
|
||||
metadata:
|
||||
@@ -228,14 +246,23 @@ argo-cd:
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: sshPrivateKey
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: type
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: type
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: url
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: url.williamp-sites
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- apiVersion: external-secrets.io/v1
|
||||
kind: ExternalSecret
|
||||
metadata:
|
||||
@@ -256,11 +283,20 @@ argo-cd:
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: sshPrivateKey
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: type
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: type
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
- secretKey: url
|
||||
remoteRef:
|
||||
key: argo-cd-git
|
||||
property: url.db-operators
|
||||
conversionStrategy: Default
|
||||
decodingStrategy: None
|
||||
metadataPolicy: None
|
||||
@@ -24,5 +24,5 @@ appVersion: "1.0"
|
||||
|
||||
dependencies:
|
||||
- name: cert-manager
|
||||
version: v1.19.2
|
||||
version: v1.19.4
|
||||
repository: https://charts.jetstack.io
|
||||
@@ -24,5 +24,5 @@ appVersion: "1.0"
|
||||
|
||||
dependencies:
|
||||
- name: external-secrets
|
||||
version: 1.1.1
|
||||
version: 2.0.1
|
||||
repository: https://charts.external-secrets.io
|
||||
@@ -24,5 +24,5 @@ appVersion: "1.0"
|
||||
|
||||
dependencies:
|
||||
- name: kite
|
||||
version: 0.7.6
|
||||
version: 0.7.8
|
||||
repository: https://zxh326.github.io/kite
|
||||
@@ -1,5 +1,7 @@
|
||||
kite:
|
||||
host: "https://weyma-kite.infra.dubyatp.xyz"
|
||||
deploymentStrategy:
|
||||
type: Recreate
|
||||
secret:
|
||||
create: false
|
||||
existingSecret: kite-secret
|
||||
@@ -16,3 +18,5 @@ kite:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: ImplementationSpecific
|
||||
podAnnotations:
|
||||
backup.velero.io/backup-volumes: kite-storage
|
||||
@@ -24,5 +24,5 @@ appVersion: "1.0"
|
||||
|
||||
dependencies:
|
||||
- name: kubernetes-replicator
|
||||
version: 2.12.2
|
||||
version: 2.12.3
|
||||
repository: https://helm.mittwald.de
|
||||
@@ -24,5 +24,5 @@ appVersion: "1.0"
|
||||
|
||||
dependencies:
|
||||
- name: kube-prometheus-stack
|
||||
version: 80.6.0
|
||||
version: 82.6.1
|
||||
repository: https://prometheus-community.github.io/helm-charts
|
||||
@@ -21,7 +21,7 @@ spec:
|
||||
# versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/.
|
||||
# If you want to be more precise, you can always use a timestamp tag such as quay.io/ceph/ceph:v19.2.1-20250202
|
||||
# This tag might not contain a new Ceph version, just security fixes from the underlying operating system, which will reduce vulnerabilities
|
||||
image: quay.io/ceph/ceph:v19.2.3-20250717
|
||||
image: quay.io/ceph/ceph:v20.2.0-20251104
|
||||
# Whether to allow unsupported versions of Ceph. Currently Reef and Squid are supported.
|
||||
# Future versions such as Tentacle (v20) would require this to be set to `true`.
|
||||
# Do not set to true in production.
|
||||
|
||||
@@ -24,5 +24,5 @@ appVersion: "1.0"
|
||||
|
||||
dependencies:
|
||||
- name: rook-ceph
|
||||
version: v1.18.8
|
||||
version: v1.19.2
|
||||
repository: https://charts.rook.io/release
|
||||
@@ -497,61 +497,6 @@ spec:
|
||||
oid: "1.3.6.1.4.1.50495.1.2.1.8.1"
|
||||
severity: "critical"
|
||||
type: "ceph_default"
|
||||
- alert: "CephNodeNetworkPacketDrops"
|
||||
annotations:
|
||||
description: "Node {{ "{{" }} $labels.instance {{ "}}" }} experiences packet drop > 0.5% or > 10 packets/s on interface {{ "{{" }} $labels.device {{ "}}" }}."
|
||||
summary: "One or more NICs reports packet drops"
|
||||
expr: |
|
||||
(
|
||||
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||
) / (
|
||||
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||
) >= 0.0050000000000000001 and (
|
||||
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||
) >= 10
|
||||
labels:
|
||||
oid: "1.3.6.1.4.1.50495.1.2.1.8.2"
|
||||
severity: "warning"
|
||||
type: "ceph_default"
|
||||
- alert: "CephNodeNetworkPacketErrors"
|
||||
annotations:
|
||||
description: "Node {{ "{{" }} $labels.instance {{ "}}" }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ "{{" }} $labels.device {{ "}}" }}."
|
||||
summary: "One or more NICs reports packet errors"
|
||||
expr: |
|
||||
(
|
||||
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||
) / (
|
||||
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||
) >= 0.0001 or (
|
||||
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||
) >= 10
|
||||
labels:
|
||||
oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
|
||||
severity: "warning"
|
||||
type: "ceph_default"
|
||||
- alert: "CephNodeNetworkBondDegraded"
|
||||
annotations:
|
||||
description: "Bond {{ "{{" }} $labels.master {{ "}}" }} is degraded on Node {{ "{{" }} $labels.instance {{ "}}" }}."
|
||||
summary: "Degraded Bond on Node {{ "{{" }} $labels.instance {{ "}}" }}"
|
||||
expr: |
|
||||
node_bonding_slaves - node_bonding_active != 0
|
||||
labels:
|
||||
severity: "warning"
|
||||
type: "ceph_default"
|
||||
- alert: "CephNodeInconsistentMTU"
|
||||
annotations:
|
||||
description: "Node {{ "{{" }} $labels.instance {{ "}}" }} has a different MTU size ({{ "{{" }} $value {{ "}}" }}) than the median of devices named {{ "{{" }} $labels.device {{ "}}" }}."
|
||||
summary: "MTU settings across Ceph hosts are inconsistent"
|
||||
expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )"
|
||||
labels:
|
||||
severity: "warning"
|
||||
type: "ceph_default"
|
||||
- name: "pools"
|
||||
rules:
|
||||
- alert: "CephPoolGrowthWarning"
|
||||
|
||||
@@ -24,5 +24,5 @@ appVersion: "1.0"
|
||||
|
||||
dependencies:
|
||||
- name: traefik
|
||||
version: 38.0.1
|
||||
version: 39.0.2
|
||||
repository: https://traefik.github.io/charts
|
||||
@@ -4,6 +4,7 @@ traefik:
|
||||
- --entryPoints.websecure.transport.respondingTimeouts.readTimeout=0
|
||||
ports:
|
||||
web:
|
||||
http:
|
||||
redirections:
|
||||
entryPoint:
|
||||
to: websecure
|
||||
@@ -14,8 +15,6 @@ traefik:
|
||||
exposedPort: 22
|
||||
expose:
|
||||
default: true
|
||||
tls:
|
||||
passthrough: true
|
||||
metrics:
|
||||
prometheus:
|
||||
service:
|
||||
@@ -38,7 +37,7 @@ traefik:
|
||||
kind: DaemonSet
|
||||
additionalContainers:
|
||||
- name: cloudflared
|
||||
image: cloudflare/cloudflared:2025.11.1
|
||||
image: cloudflare/cloudflared:2026.2.0
|
||||
command:
|
||||
- cloudflared
|
||||
- tunnel
|
||||
@@ -130,3 +129,26 @@ traefik:
|
||||
data:
|
||||
tls.crt: ""
|
||||
tls.key: ""
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: traefik-local
|
||||
spec:
|
||||
sessionAffinity: ClientIP
|
||||
sessionAffinityConfig:
|
||||
clientIP:
|
||||
timeoutSeconds: 3600
|
||||
selector:
|
||||
app.kubernetes.io/name: traefik
|
||||
app.kubernetes.io/instance: traefik-traefik
|
||||
ports:
|
||||
- name: gitssh
|
||||
port: 22
|
||||
targetPort: gitssh
|
||||
- name: web
|
||||
port: 80
|
||||
targetPort: web
|
||||
- name: websecure
|
||||
port: 443
|
||||
targetPort: websecure
|
||||
type: ClusterIP
|
||||
@@ -24,5 +24,5 @@ appVersion: "1.0"
|
||||
|
||||
dependencies:
|
||||
- name: velero
|
||||
version: 11.2.0
|
||||
version: 11.4.0
|
||||
repository: https://vmware-tanzu.github.io/helm-charts
|
||||
@@ -59,7 +59,7 @@ velero:
|
||||
insecureSkipTLSVerify: "true"
|
||||
initContainers:
|
||||
- name: velero-plugin-for-aws
|
||||
image: velero/velero-plugin-for-aws:v1.13.1
|
||||
image: velero/velero-plugin-for-aws:v1.13.2
|
||||
imagePullPolicy: IfNotPresent
|
||||
volumeMounts:
|
||||
- mountPath: /target
|
||||
|
||||
Reference in New Issue
Block a user