Compare commits

..

1 Commits

16 changed files with 21 additions and 170 deletions

View File

@@ -1,37 +0,0 @@
# Main Infrastructure: weyma-talos
**Production Kubernetes infrastructure with disaster recovery capabilities**
This repository contains the foundational infrastructure for my Kubernetes homelab, designed with reliability and rapid recovery as core principles.
## Architecture
My infrastructure follows a layered "black start" approach - essential services run outside the Kubernetes cluster to enable cluster bootstrapping and recovery from total failures.
### Black Start Layer
Static services (Docker Compose on TrueNAS/Proxmox) that provide cluster dependencies:
- Image cache for faster deployments and offline capability
- Talos discovery server for node bootstrapping
- HashiCorp Vault for secrets management (external to cluster)
- Future: Self-hosted Sidero Omni server (migrating from SaaS)
### System Apps Layer
Applications running within Kubernetes that provide core cluster functionality, managed via ArgoCD with GitOps principles.
## Repository Structure
- **`black-start/`** - Docker Compose services for cluster dependencies
- **`config-patches/`** - Talos Linux configuration patches for cluster and individual machines
- **`omni/`** - Sidero Omni [cluster template](https://docs.siderolabs.com/omni/reference/cluster-templates)
- **`system-apps/`** - System applications (ArgoCD projects) - monitoring, ingress, certificates, storage
## Tech Stack
**OS:** Talos Linux | **Orchestration:** Kubernetes | **GitOps:** ArgoCD | **Secrets:** Vault | **Storage:** Rook-Ceph
## Recovery Process
The "black start" architecture enables ~15-20 minute automated recovery from complete infrastructure failure:
1. Start black-start services → 2. Bootstrap Talos → 3. Deploy system apps → 4. Deploy core apps
For application deployments, see [core-apps](https://git.dubyatp.xyz/core-apps).

View File

@@ -2,7 +2,7 @@ version: "3.8"
services:
discovery:
restart: unless-stopped
image: ghcr.io/siderolabs/discovery-service:v1.0.15
image: ghcr.io/siderolabs/discovery-service:v1.0.13
ports:
- 10.105.6.215:3000:3000
- 10.105.6.215:3001:3001

View File

@@ -52,7 +52,6 @@ patches:
bind-address: 0.0.0.0
proxy:
extraArgs:
proxy-mode: ipvs
metrics-bind-address: 0.0.0.0:10249
scheduler:
extraArgs:
@@ -288,45 +287,6 @@ patches:
selector:
k8s-app: metrics-server
name: metrics-lb
- contents: |-
apiVersion: v1
data:
Corefile: |
.:53 {
errors
health {
lameduck 5s
}
ready
log . {
class error
}
prometheus :9153
kubernetes cluster.local in-addr.arpa ip6.arpa {
pods insecure
fallthrough in-addr.arpa ip6.arpa
ttl 30
}
rewrite name git.dubyatp.xyz traefik-local.traefik.svc.cluster.local
forward . /etc/resolv.conf {
max_concurrent 1000
}
cache 30 {
disable success cluster.local
disable denial cluster.local
}
loop
reload
loadbalance
}
kind: ConfigMap
metadata:
name: coredns
namespace: kube-system
name: coredns-config
---
kind: ControlPlane
machines:

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: argo-cd
version: 9.4.6
version: 9.3.4
repository: https://argoproj.github.io/argo-helm

View File

@@ -128,30 +128,18 @@ argo-cd:
remoteRef:
key: argo-cd
property: webhook.gitea.secret
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: admin.password
remoteRef:
key: argo-cd
property: admin.password
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: admin.passwordMtime
remoteRef:
key: argo-cd
property: admin.passwordMtime
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: dex.authentik.clientSecret
remoteRef:
key: argo-cd
property: dex.authentik.clientSecret
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
@@ -172,23 +160,14 @@ argo-cd:
remoteRef:
key: argo-cd-git
property: sshPrivateKey
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: type
remoteRef:
key: argo-cd-git
property: type
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: url
remoteRef:
key: argo-cd-git
property: url.core-apps
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
@@ -209,23 +188,14 @@ argo-cd:
remoteRef:
key: argo-cd-git
property: sshPrivateKey
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: type
remoteRef:
key: argo-cd-git
property: type
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: url
remoteRef:
key: argo-cd-git
property: url.weyma-talos
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
@@ -246,23 +216,14 @@ argo-cd:
remoteRef:
key: argo-cd-git
property: sshPrivateKey
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: type
remoteRef:
key: argo-cd-git
property: type
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: url
remoteRef:
key: argo-cd-git
property: url.williamp-sites
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
@@ -283,20 +244,11 @@ argo-cd:
remoteRef:
key: argo-cd-git
property: sshPrivateKey
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: type
remoteRef:
key: argo-cd-git
property: type
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
- secretKey: url
remoteRef:
key: argo-cd-git
property: url.db-operators
conversionStrategy: Default
decodingStrategy: None
metadataPolicy: None
property: url.db-operators

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: cert-manager
version: v1.19.4
version: v1.19.2
repository: https://charts.jetstack.io

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: external-secrets
version: 2.0.1
version: 1.2.1
repository: https://charts.external-secrets.io

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: kite
version: 0.7.8
version: 0.7.7
repository: https://zxh326.github.io/kite

View File

@@ -1,7 +1,5 @@
kite:
host: "https://weyma-kite.infra.dubyatp.xyz"
deploymentStrategy:
type: Recreate
secret:
create: false
existingSecret: kite-secret

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: kubernetes-replicator
version: 2.12.3
version: 2.12.2
repository: https://helm.mittwald.de

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: kube-prometheus-stack
version: 82.4.3
version: 81.0.0
repository: https://prometheus-community.github.io/helm-charts

View File

@@ -21,7 +21,7 @@ spec:
# versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/.
# If you want to be more precise, you can always use a timestamp tag such as quay.io/ceph/ceph:v19.2.1-20250202
# This tag might not contain a new Ceph version, just security fixes from the underlying operating system, which will reduce vulnerabilities
image: quay.io/ceph/ceph:v20.2.0-20251104
image: quay.io/ceph/ceph:v19.2.3-20250717
# Whether to allow unsupported versions of Ceph. Currently Reef and Squid are supported.
# Future versions such as Tentacle (v20) would require this to be set to `true`.
# Do not set to true in production.

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: rook-ceph
version: v1.19.2
version: v1.18.9
repository: https://charts.rook.io/release

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: traefik
version: 39.0.2
version: 38.0.2
repository: https://traefik.github.io/charts

View File

@@ -4,17 +4,18 @@ traefik:
- --entryPoints.websecure.transport.respondingTimeouts.readTimeout=0
ports:
web:
http:
redirections:
entryPoint:
to: websecure
scheme: https
permanent: true
redirections:
entryPoint:
to: websecure
scheme: https
permanent: true
gitssh:
port: 2222
exposedPort: 22
expose:
default: true
tls:
passthrough: true
metrics:
prometheus:
service:
@@ -37,7 +38,7 @@ traefik:
kind: DaemonSet
additionalContainers:
- name: cloudflared
image: cloudflare/cloudflared:2026.2.0
image: cloudflare/cloudflared:2025.11.1
command:
- cloudflared
- tunnel
@@ -128,27 +129,4 @@ traefik:
replicator.v1.mittwald.de/replicated-keys: "tls.crt,tls.key"
data:
tls.crt: ""
tls.key: ""
- apiVersion: v1
kind: Service
metadata:
name: traefik-local
spec:
sessionAffinity: ClientIP
sessionAffinityConfig:
clientIP:
timeoutSeconds: 3600
selector:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
ports:
- name: gitssh
port: 22
targetPort: gitssh
- name: web
port: 80
targetPort: web
- name: websecure
port: 443
targetPort: websecure
type: ClusterIP
tls.key: ""

View File

@@ -24,5 +24,5 @@ appVersion: "1.0"
dependencies:
- name: velero
version: 11.4.0
version: 11.3.2
repository: https://vmware-tanzu.github.io/helm-charts