cmd/containerboot,cmd/k8s-proxy,kube: add authkey renewal to k8s-proxy (#19221)
* kube/authkey,cmd/containerboot: extract shared auth key reissue package Move auth key reissue logic (set marker, wait for new key, clear marker, read config) into a shared kube/authkey package and update containerboot to use it. No behaviour change. Updates #14080 Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk> * kube/authkey,kube/state,cmd/containerboot: preserve device_id across restarts Stop clearing device_id, device_fqdn, and device_ips from state on startup. These keys are now preserved across restarts so the operator can track device identity. Expand ClearReissueAuthKey to clear device state and tailscaled profile data when performing a full auth key reissue. Updates #14080 Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk> * cmd/containerboot: use root context for auth key reissue wait Pass the root context instead of bootCtx to setAndWaitForAuthKeyReissue. The 60-second bootCtx timeout was cancelling the reissue wait before the operator had time to respond, causing the pod to crash-loop. Updates #14080 Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk> * cmd/k8s-proxy: add auth key renewal support Add auth key reissue handling to k8s-proxy, mirroring containerboot. When the proxy detects an auth failure (login-state health warning or NeedsLogin state), it disconnects from control, signals the operator via the state Secret, waits for a new key, clears stale state, and exits so Kubernetes restarts the pod with the new key. A health watcher goroutine runs alongside ts.Up() to short-circuit the startup timeout on terminal auth failures. Updates #14080 Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk> --------- Signed-off-by: chaosinthecrd <tom@tmlabs.co.uk>
This commit is contained in:
+28
-85
@@ -21,6 +21,7 @@ import (
|
||||
"github.com/fsnotify/fsnotify"
|
||||
"tailscale.com/client/local"
|
||||
"tailscale.com/ipn"
|
||||
"tailscale.com/kube/authkey"
|
||||
"tailscale.com/kube/egressservices"
|
||||
"tailscale.com/kube/ingressservices"
|
||||
"tailscale.com/kube/kubeapi"
|
||||
@@ -32,7 +33,6 @@ import (
|
||||
)
|
||||
|
||||
const fieldManager = "tailscale-container"
|
||||
const kubeletMountedConfigLn = "..data"
|
||||
|
||||
// kubeClient is a wrapper around Tailscale's internal kube client that knows how to talk to the kube API server. We use
|
||||
// this rather than any of the upstream Kubernetes client libaries to avoid extra imports.
|
||||
@@ -127,6 +127,9 @@ func (kc *kubeClient) deleteAuthKey(ctx context.Context) error {
|
||||
|
||||
// resetContainerbootState resets state from previous runs of containerboot to
|
||||
// ensure the operator doesn't use stale state when a Pod is first recreated.
|
||||
//
|
||||
// Device identity keys (device_id, device_fqdn, device_ips) are preserved so
|
||||
// the operator can clean up the old device from the control plane.
|
||||
func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string, tailscaledConfigAuthkey string) error {
|
||||
existingSecret, err := kc.GetSecret(ctx, kc.stateSecret)
|
||||
switch {
|
||||
@@ -139,12 +142,7 @@ func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string
|
||||
|
||||
s := &kubeapi.Secret{
|
||||
Data: map[string][]byte{
|
||||
kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion),
|
||||
|
||||
// TODO(tomhjp): Perhaps shouldn't clear device ID and use a different signal, as this could leak tailnet devices.
|
||||
kubetypes.KeyDeviceID: nil,
|
||||
kubetypes.KeyDeviceFQDN: nil,
|
||||
kubetypes.KeyDeviceIPs: nil,
|
||||
kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion),
|
||||
kubetypes.KeyHTTPSEndpoint: nil,
|
||||
egressservices.KeyEgressServices: nil,
|
||||
ingressservices.IngressConfigKey: nil,
|
||||
@@ -169,47 +167,18 @@ func (kc *kubeClient) setAndWaitForAuthKeyReissue(ctx context.Context, client *l
|
||||
return fmt.Errorf("error disconnecting from control: %w", err)
|
||||
}
|
||||
|
||||
err = kc.setReissueAuthKey(ctx, tailscaledConfigAuthKey)
|
||||
err = authkey.SetReissueAuthKey(ctx, kc.Client, kc.stateSecret, tailscaledConfigAuthKey, authkey.TailscaleContainerFieldManager)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to set reissue_authkey in Kubernetes Secret: %w", err)
|
||||
}
|
||||
|
||||
err = kc.waitForAuthKeyReissue(ctx, cfg.TailscaledConfigFilePath, tailscaledConfigAuthKey, 10*time.Minute)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to receive new auth key: %w", err)
|
||||
clearFn := func(ctx context.Context) error {
|
||||
return authkey.ClearReissueAuthKey(ctx, kc.Client, kc.stateSecret, authkey.TailscaleContainerFieldManager)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (kc *kubeClient) setReissueAuthKey(ctx context.Context, authKey string) error {
|
||||
s := &kubeapi.Secret{
|
||||
Data: map[string][]byte{
|
||||
kubetypes.KeyReissueAuthkey: []byte(authKey),
|
||||
},
|
||||
}
|
||||
|
||||
log.Printf("Requesting a new auth key from operator")
|
||||
return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager)
|
||||
}
|
||||
|
||||
func (kc *kubeClient) waitForAuthKeyReissue(ctx context.Context, configPath string, oldAuthKey string, maxWait time.Duration) error {
|
||||
log.Printf("Waiting for operator to provide new auth key (max wait: %v)", maxWait)
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, maxWait)
|
||||
defer cancel()
|
||||
|
||||
tailscaledCfgDir := filepath.Dir(configPath)
|
||||
toWatch := filepath.Join(tailscaledCfgDir, kubeletMountedConfigLn)
|
||||
|
||||
var (
|
||||
pollTicker <-chan time.Time
|
||||
eventChan <-chan fsnotify.Event
|
||||
)
|
||||
|
||||
pollInterval := 5 * time.Second
|
||||
|
||||
// Try to use fsnotify for faster notification
|
||||
getAuthKey := func() string { return authkey.AuthKeyFromConfig(cfg.TailscaledConfigFilePath) }
|
||||
tailscaledCfgDir := filepath.Dir(cfg.TailscaledConfigFilePath)
|
||||
var notify <-chan struct{}
|
||||
if w, err := fsnotify.NewWatcher(); err != nil {
|
||||
log.Printf("auth key reissue: fsnotify unavailable, using polling: %v", err)
|
||||
} else if err := w.Add(tailscaledCfgDir); err != nil {
|
||||
@@ -217,54 +186,28 @@ func (kc *kubeClient) waitForAuthKeyReissue(ctx context.Context, configPath stri
|
||||
log.Printf("auth key reissue: fsnotify watch failed, using polling: %v", err)
|
||||
} else {
|
||||
defer w.Close()
|
||||
ch := make(chan struct{}, 1)
|
||||
toWatch := filepath.Join(tailscaledCfgDir, "..data")
|
||||
go func() {
|
||||
for ev := range w.Events {
|
||||
if ev.Name == toWatch {
|
||||
select {
|
||||
case ch <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
notify = ch
|
||||
log.Printf("auth key reissue: watching for config changes via fsnotify")
|
||||
eventChan = w.Events
|
||||
}
|
||||
|
||||
// still keep polling if using fsnotify, for logging and in case fsnotify fails
|
||||
pt := time.NewTicker(pollInterval)
|
||||
defer pt.Stop()
|
||||
pollTicker = pt.C
|
||||
|
||||
start := time.Now()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return fmt.Errorf("timeout waiting for auth key reissue after %v", maxWait)
|
||||
case <-pollTicker: // Waits for polling tick, continues when received
|
||||
case event := <-eventChan:
|
||||
if event.Name != toWatch {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
newAuthKey := authkeyFromTailscaledConfig(configPath)
|
||||
if newAuthKey != "" && newAuthKey != oldAuthKey {
|
||||
log.Printf("New auth key received from operator after %v", time.Since(start).Round(time.Second))
|
||||
|
||||
if err := kc.clearReissueAuthKeyRequest(ctx); err != nil {
|
||||
log.Printf("Warning: failed to clear reissue request: %v", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
if eventChan == nil && pollTicker != nil {
|
||||
log.Printf("Waiting for new auth key from operator (%v elapsed)", time.Since(start).Round(time.Second))
|
||||
}
|
||||
err = authkey.WaitForAuthKeyReissue(ctx, tailscaledConfigAuthKey, 10*time.Minute, getAuthKey, clearFn, notify)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to receive new auth key: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// clearReissueAuthKeyRequest removes the reissue_authkey marker from the Secret
|
||||
// to signal to the operator that we've successfully received the new key.
|
||||
func (kc *kubeClient) clearReissueAuthKeyRequest(ctx context.Context) error {
|
||||
s := &kubeapi.Secret{
|
||||
Data: map[string][]byte{
|
||||
kubetypes.KeyReissueAuthkey: nil,
|
||||
},
|
||||
}
|
||||
return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager)
|
||||
return nil
|
||||
}
|
||||
|
||||
// waitForConsistentState waits for tailscaled to finish writing state if it
|
||||
|
||||
@@ -257,12 +257,8 @@ func TestResetContainerbootState(t *testing.T) {
|
||||
authkey: "new-authkey",
|
||||
initial: map[string][]byte{},
|
||||
expected: map[string][]byte{
|
||||
kubetypes.KeyCapVer: capver,
|
||||
kubetypes.KeyPodUID: []byte("1234"),
|
||||
// Cleared keys.
|
||||
kubetypes.KeyDeviceID: nil,
|
||||
kubetypes.KeyDeviceFQDN: nil,
|
||||
kubetypes.KeyDeviceIPs: nil,
|
||||
kubetypes.KeyCapVer: capver,
|
||||
kubetypes.KeyPodUID: []byte("1234"),
|
||||
kubetypes.KeyHTTPSEndpoint: nil,
|
||||
egressservices.KeyEgressServices: nil,
|
||||
ingressservices.IngressConfigKey: nil,
|
||||
@@ -271,11 +267,7 @@ func TestResetContainerbootState(t *testing.T) {
|
||||
"empty_initial_no_pod_uid": {
|
||||
initial: map[string][]byte{},
|
||||
expected: map[string][]byte{
|
||||
kubetypes.KeyCapVer: capver,
|
||||
// Cleared keys.
|
||||
kubetypes.KeyDeviceID: nil,
|
||||
kubetypes.KeyDeviceFQDN: nil,
|
||||
kubetypes.KeyDeviceIPs: nil,
|
||||
kubetypes.KeyCapVer: capver,
|
||||
kubetypes.KeyHTTPSEndpoint: nil,
|
||||
egressservices.KeyEgressServices: nil,
|
||||
ingressservices.IngressConfigKey: nil,
|
||||
@@ -303,9 +295,6 @@ func TestResetContainerbootState(t *testing.T) {
|
||||
kubetypes.KeyCapVer: capver,
|
||||
kubetypes.KeyPodUID: []byte("1234"),
|
||||
// Cleared keys.
|
||||
kubetypes.KeyDeviceID: nil,
|
||||
kubetypes.KeyDeviceFQDN: nil,
|
||||
kubetypes.KeyDeviceIPs: nil,
|
||||
kubetypes.KeyHTTPSEndpoint: nil,
|
||||
egressservices.KeyEgressServices: nil,
|
||||
ingressservices.IngressConfigKey: nil,
|
||||
@@ -321,9 +310,6 @@ func TestResetContainerbootState(t *testing.T) {
|
||||
kubetypes.KeyCapVer: capver,
|
||||
kubetypes.KeyReissueAuthkey: nil,
|
||||
// Cleared keys.
|
||||
kubetypes.KeyDeviceID: nil,
|
||||
kubetypes.KeyDeviceFQDN: nil,
|
||||
kubetypes.KeyDeviceIPs: nil,
|
||||
kubetypes.KeyHTTPSEndpoint: nil,
|
||||
egressservices.KeyEgressServices: nil,
|
||||
ingressservices.IngressConfigKey: nil,
|
||||
@@ -338,9 +324,6 @@ func TestResetContainerbootState(t *testing.T) {
|
||||
kubetypes.KeyCapVer: capver,
|
||||
// reissue_authkey not cleared.
|
||||
// Cleared keys.
|
||||
kubetypes.KeyDeviceID: nil,
|
||||
kubetypes.KeyDeviceFQDN: nil,
|
||||
kubetypes.KeyDeviceIPs: nil,
|
||||
kubetypes.KeyHTTPSEndpoint: nil,
|
||||
egressservices.KeyEgressServices: nil,
|
||||
ingressservices.IngressConfigKey: nil,
|
||||
@@ -355,9 +338,6 @@ func TestResetContainerbootState(t *testing.T) {
|
||||
kubetypes.KeyCapVer: capver,
|
||||
// reissue_authkey not cleared.
|
||||
// Cleared keys.
|
||||
kubetypes.KeyDeviceID: nil,
|
||||
kubetypes.KeyDeviceFQDN: nil,
|
||||
kubetypes.KeyDeviceIPs: nil,
|
||||
kubetypes.KeyHTTPSEndpoint: nil,
|
||||
egressservices.KeyEgressServices: nil,
|
||||
ingressservices.IngressConfigKey: nil,
|
||||
|
||||
@@ -139,8 +139,8 @@ import (
|
||||
|
||||
"tailscale.com/health"
|
||||
"tailscale.com/ipn"
|
||||
"tailscale.com/ipn/conffile"
|
||||
kubeutils "tailscale.com/k8s-operator"
|
||||
"tailscale.com/kube/authkey"
|
||||
healthz "tailscale.com/kube/health"
|
||||
"tailscale.com/kube/kubetypes"
|
||||
klc "tailscale.com/kube/localclient"
|
||||
@@ -209,7 +209,7 @@ func run() error {
|
||||
|
||||
var tailscaledConfigAuthkey string
|
||||
if isOneStepConfig(cfg) {
|
||||
tailscaledConfigAuthkey = authkeyFromTailscaledConfig(cfg.TailscaledConfigFilePath)
|
||||
tailscaledConfigAuthkey = authkey.AuthKeyFromConfig(cfg.TailscaledConfigFilePath)
|
||||
}
|
||||
|
||||
var kc *kubeClient
|
||||
@@ -374,7 +374,7 @@ authLoop:
|
||||
if hasKubeStateStore(cfg) {
|
||||
log.Printf("Auth key missing or invalid (NeedsLogin state), disconnecting from control and requesting new key from operator")
|
||||
|
||||
err := kc.setAndWaitForAuthKeyReissue(bootCtx, client, cfg, tailscaledConfigAuthkey)
|
||||
err := kc.setAndWaitForAuthKeyReissue(ctx, client, cfg, tailscaledConfigAuthkey)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get a reissued authkey: %w", err)
|
||||
}
|
||||
@@ -414,7 +414,7 @@ authLoop:
|
||||
if isOneStepConfig(cfg) && hasKubeStateStore(cfg) {
|
||||
log.Printf("Auth key failed to authenticate (may be expired or single-use), disconnecting from control and requesting new key from operator")
|
||||
|
||||
err := kc.setAndWaitForAuthKeyReissue(bootCtx, client, cfg, tailscaledConfigAuthkey)
|
||||
err := kc.setAndWaitForAuthKeyReissue(ctx, client, cfg, tailscaledConfigAuthkey)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get a reissued authkey: %w", err)
|
||||
}
|
||||
@@ -1024,11 +1024,3 @@ func serviceIPsFromNetMap(nm *netmap.NetworkMap, fqdn dnsname.FQDN) []netip.Pref
|
||||
|
||||
return prefixes
|
||||
}
|
||||
|
||||
func authkeyFromTailscaledConfig(path string) string {
|
||||
if cfg, err := conffile.Load(path); err == nil && cfg.Parsed.AuthKey != nil {
|
||||
return *cfg.Parsed.AuthKey
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user