cmd/containerboot,cmd/k8s-proxy,kube: add authkey renewal to k8s-proxy (#19221)

* kube/authkey,cmd/containerboot: extract shared auth key reissue package

Move auth key reissue logic (set marker, wait for new key, clear marker,
read config) into a shared kube/authkey package and update containerboot
to use it. No behaviour change.

Updates #14080

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

* kube/authkey,kube/state,cmd/containerboot: preserve device_id across restarts

Stop clearing device_id, device_fqdn, and device_ips from state on startup.
These keys are now preserved across restarts so the operator can track
device identity. Expand ClearReissueAuthKey to clear device state and
tailscaled profile data when performing a full auth key reissue.

Updates #14080

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

* cmd/containerboot: use root context for auth key reissue wait

Pass the root context instead of bootCtx to setAndWaitForAuthKeyReissue.
The 60-second bootCtx timeout was cancelling the reissue wait before the
operator had time to respond, causing the pod to crash-loop.

Updates #14080

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

* cmd/k8s-proxy: add auth key renewal support

Add auth key reissue handling to k8s-proxy, mirroring containerboot.
When the proxy detects an auth failure (login-state health warning or
NeedsLogin state), it disconnects from control, signals the operator
via the state Secret, waits for a new key, clears stale state, and
exits so Kubernetes restarts the pod with the new key.

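A health watcher goroutine runs alongside ts.Up() and cancels its
context as soon as the login-state health warning appears, so a terminal
auth failure triggers the reissue flow right away instead of waiting out
the startup timeout.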

Updates #14080

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>

---------

Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>
This commit is contained in:
Tom Meadows
2026-04-15 16:13:46 +01:00
committed by GitHub
parent dbf468740b
commit 5eb0b4be31
10 changed files with 666 additions and 143 deletions
+79 -8
View File
@@ -31,6 +31,7 @@ import (
"k8s.io/utils/strings/slices"
"tailscale.com/client/local"
"tailscale.com/cmd/k8s-proxy/internal/config"
"tailscale.com/health"
"tailscale.com/hostinfo"
"tailscale.com/ipn"
"tailscale.com/ipn/store"
@@ -41,6 +42,7 @@ import (
"tailscale.com/kube/certs"
healthz "tailscale.com/kube/health"
"tailscale.com/kube/k8s-proxy/conf"
"tailscale.com/kube/kubeclient"
"tailscale.com/kube/kubetypes"
klc "tailscale.com/kube/localclient"
"tailscale.com/kube/metrics"
@@ -171,10 +173,31 @@ func run(logger *zap.SugaredLogger) error {
// If Pod UID unset, assume we're running outside of a cluster/not managed
// by the operator, so no need to set additional state keys.
var kc kubeclient.Client
var stateSecretName string
if podUID != "" {
if err := state.SetInitialKeys(st, podUID); err != nil {
return fmt.Errorf("error setting initial state: %w", err)
}
if cfg.Parsed.State != nil {
if name, ok := strings.CutPrefix(*cfg.Parsed.State, "kube:"); ok {
stateSecretName = name
kc, err = kubeclient.New(k8sProxyFieldManager)
if err != nil {
return err
}
var configAuthKey string
if cfg.Parsed.AuthKey != nil {
configAuthKey = *cfg.Parsed.AuthKey
}
if err := resetState(ctx, kc, stateSecretName, podUID, configAuthKey); err != nil {
return fmt.Errorf("error resetting state: %w", err)
}
}
}
}
var authKey string
@@ -197,23 +220,69 @@ func run(logger *zap.SugaredLogger) error {
ts.Hostname = *cfg.Parsed.Hostname
}
// Make sure we crash loop if Up doesn't complete in reasonable time.
upCtx, upCancel := context.WithTimeout(ctx, time.Minute)
defer upCancel()
if _, err := ts.Up(upCtx); err != nil {
return fmt.Errorf("error starting tailscale server: %w", err)
}
defer ts.Close()
lc, err := ts.LocalClient()
if err != nil {
return fmt.Errorf("error getting local client: %w", err)
}
// Setup for updating state keys.
// Make sure we crash loop if Up doesn't complete in reasonable time.
upCtx, upCancel := context.WithTimeout(ctx, 30*time.Second)
defer upCancel()
// ts.Up() deliberately ignores NeedsLogin because it fires transiently
// during normal auth-key login. We can watch for the login-state health
// warning here though, which only fires on terminal auth failure, and
// cancel early.
go func() {
w, err := lc.WatchIPNBus(upCtx, ipn.NotifyInitialHealthState)
if err != nil {
return
}
defer w.Close()
for {
n, err := w.Next()
if err != nil {
logger.Debugf("failed to process message from ipn bus: %s", err.Error())
return
}
if n.Health != nil {
if _, ok := n.Health.Warnings[health.LoginStateWarnable.Code]; ok {
upCancel()
return
}
}
}
}()
if _, err := ts.Up(upCtx); err != nil {
if kc != nil && stateSecretName != "" {
return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger)
}
return err
}
defer ts.Close()
reissueCh := make(chan struct{}, 1)
if podUID != "" {
group.Go(func() error {
return state.KeepKeysUpdated(ctx, st, klc.New(lc))
})
if kc != nil && stateSecretName != "" {
needsReissue, err := checkInitialAuthState(ctx, lc)
if err != nil {
return fmt.Errorf("error checking initial auth state: %w", err)
}
if needsReissue {
logger.Info("Auth key missing or invalid after startup, requesting new key from operator")
return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger)
}
group.Go(func() error {
return monitorAuthHealth(ctx, lc, reissueCh, logger)
})
}
}
if cfg.Parsed.HealthCheckEnabled.EqualBool(true) || cfg.Parsed.MetricsEnabled.EqualBool(true) {
@@ -362,6 +431,8 @@ func run(logger *zap.SugaredLogger) error {
}
cfgLogger.Infof("Config reloaded")
case <-reissueCh:
return handleAuthKeyReissue(ctx, lc, kc, stateSecretName, authKey, cfgChan, logger)
}
}
}