cmd/containerboot,cmd/k8s-proxy,kube: add authkey renewal to k8s-proxy (#19221)

* kube/authkey,cmd/containerboot: extract shared auth key reissue package

  Move auth key reissue logic (set marker, wait for new key, clear marker,
  read config) into a shared kube/authkey package and update containerboot
  to use it. No behaviour change.

  Updates #14080

  Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

* kube/authkey,kube/state,cmd/containerboot: preserve device_id across restarts

  Stop clearing device_id, device_fqdn, and device_ips from state on
  startup. These keys are now preserved across restarts so the operator
  can track device identity. Expand ClearReissueAuthKey to clear device
  state and tailscaled profile data when performing a full auth key
  reissue.

  Updates #14080

  Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

* cmd/containerboot: use root context for auth key reissue wait

  Pass the root context instead of bootCtx to setAndWaitForAuthKeyReissue.
  The 60-second bootCtx timeout was cancelling the reissue wait before the
  operator had time to respond, causing the pod to crash-loop.

  Updates #14080

  Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

* cmd/k8s-proxy: add auth key renewal support

  Add auth key reissue handling to k8s-proxy, mirroring containerboot.
  When the proxy detects an auth failure (login-state health warning or
  NeedsLogin state), it disconnects from control, signals the operator via
  the state Secret, waits for a new key, clears stale state, and exits so
  Kubernetes restarts the pod with the new key. A health watcher goroutine
  runs alongside ts.Up() to short-circuit the startup timeout on terminal
  auth failures.

  Updates #14080

  Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

---------

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>
2026-04-15 16:13:46 +01:00
parent dbf468740b
commit 5eb0b4be31
10 changed files with 666 additions and 143 deletions
@@ -31,6 +31,7 @@ import (
 	"k8s.io/utils/strings/slices"
 	"tailscale.com/client/local"
 	"tailscale.com/cmd/k8s-proxy/internal/config"
+	"tailscale.com/health"
 	"tailscale.com/hostinfo"
 	"tailscale.com/ipn"
 	"tailscale.com/ipn/store"
@@ -41,6 +42,7 @@ import (
 	"tailscale.com/kube/certs"
 	healthz "tailscale.com/kube/health"
 	"tailscale.com/kube/k8s-proxy/conf"
+	"tailscale.com/kube/kubeclient"
 	"tailscale.com/kube/kubetypes"
 	klc "tailscale.com/kube/localclient"
 	"tailscale.com/kube/metrics"
@@ -171,10 +173,31 @@ func run(logger *zap.SugaredLogger) error {

 	// If Pod UID unset, assume we're running outside of a cluster/not managed
 	// by the operator, so no need to set additional state keys.
+	var kc kubeclient.Client
+	var stateSecretName string
 	if podUID != "" {
 		if err := state.SetInitialKeys(st, podUID); err != nil {
 			return fmt.Errorf("error setting initial state: %w", err)
 		}
+
+		if cfg.Parsed.State != nil {
+			if name, ok := strings.CutPrefix(*cfg.Parsed.State, "kube:"); ok {
+				stateSecretName = name
+
+				kc, err = kubeclient.New(k8sProxyFieldManager)
+				if err != nil {
+					return err
+				}
+
+				var configAuthKey string
+				if cfg.Parsed.AuthKey != nil {
+					configAuthKey = *cfg.Parsed.AuthKey
+				}
+				if err := resetState(ctx, kc, stateSecretName, podUID, configAuthKey); err != nil {
+					return fmt.Errorf("error resetting state: %w", err)
+				}
+			}
+		}
 	}

 	var authKey string
@@ -197,23 +220,69 @@ func run(logger *zap.SugaredLogger) error {
 		ts.Hostname = *cfg.Parsed.Hostname
 	}

-	// Make sure we crash loop if Up doesn't complete in reasonable time.
-	upCtx, upCancel := context.WithTimeout(ctx, time.Minute)
-	defer upCancel()
-	if _, err := ts.Up(upCtx); err != nil {
-		return fmt.Errorf("error starting tailscale server: %w", err)
-	}
-	defer ts.Close()
 	lc, err := ts.LocalClient()
 	if err != nil {
 		return fmt.Errorf("error getting local client: %w", err)
 	}

-	// Setup for updating state keys.
+	// Make sure we crash loop if Up doesn't complete in reasonable time.
+	upCtx, upCancel := context.WithTimeout(ctx, 30*time.Second)
+	defer upCancel()
+
+	// ts.Up() deliberately ignores NeedsLogin because it fires transiently
+	// during normal auth-key login. We can watch for the login-state health
+	// warning here though, which only fires on terminal auth failure, and
+	// cancel early.
+	go func() {
+		w, err := lc.WatchIPNBus(upCtx, ipn.NotifyInitialHealthState)
+		if err != nil {
+			return
+		}
+		defer w.Close()
+		for {
+			n, err := w.Next()
+			if err != nil {
+				logger.Debugf("failed to process message from ipn bus: %s", err.Error())
+				return
+			}
+			if n.Health != nil {
+				if _, ok := n.Health.Warnings[health.LoginStateWarnable.Code]; ok {
+					upCancel()
+					return
+				}
+			}
+		}
+	}()
+
+	if _, err := ts.Up(upCtx); err != nil {
+		if kc != nil && stateSecretName != "" {
+			return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger)
+		}
+		return err
+	}
+
+	defer ts.Close()
+
+	reissueCh := make(chan struct{}, 1)
 	if podUID != "" {
 		group.Go(func() error {
 			return state.KeepKeysUpdated(ctx, st, klc.New(lc))
 		})
+
+		if kc != nil && stateSecretName != "" {
+			needsReissue, err := checkInitialAuthState(ctx, lc)
+			if err != nil {
+				return fmt.Errorf("error checking initial auth state: %w", err)
+			}
+			if needsReissue {
+				logger.Info("Auth key missing or invalid after startup, requesting new key from operator")
+				return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger)
+			}
+
+			group.Go(func() error {
+				return monitorAuthHealth(ctx, lc, reissueCh, logger)
+			})
+		}
 	}

 	if cfg.Parsed.HealthCheckEnabled.EqualBool(true) || cfg.Parsed.MetricsEnabled.EqualBool(true) {
@@ -362,6 +431,8 @@ func run(logger *zap.SugaredLogger) error {
 			}

 			cfgLogger.Infof("Config reloaded")
+		case <-reissueCh:
+			return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger)
 		}
 	}
 }