cmd/{containerboot,k8s-operator}: reissue auth keys for broken proxies (#16450)
Adds logic for containerboot to signal that it can't auth, so the
operator can reissue a new auth key. This only applies when running with
a config file and with a kube state store.
If the operator sees reissue_authkey in a state Secret, it will create a
new auth key iff the config has no auth key or its auth key matches the
value of reissue_authkey from the state Secret. This is to ensure we
don't reissue auth keys in a tight loop if the proxy is slow to start or
failing for some other reason. The reissue logic also uses a burstable
rate limiter to ensure that a terminally misconfigured or buggy operator
cannot automatically generate new auth keys in a tight loop.
Additional implementation details (ChaosInTheCRD):
- Added `ipn.NotifyInitialHealthState` to the ipn watcher, to ensure that
`n.Health` is populated when notifies are returned.
- on auth failure, containerboot:
- Disconnects from control server
- Sets reissue_authkey marker in state Secret with the failing key
- Polls config file for new auth key (10 minute timeout)
- Restarts after receiving new key to apply it
- modified operator's reissue logic slightly:
- Deletes old device from tailnet before creating new key
- Rate limiting: 1 key per 30s with initial burst equal to replica count
- In-flight tracking (authKeyReissuing map) prevents duplicate API calls
across reconcile loops
Updates #14080
Change-Id: I6982f8e741932a6891f2f48a2936f7f6a455317f
(cherry picked from commit 969927c47c3d4de05e90f5b26a6d8d931c5ceed4)
Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
Co-authored-by: chaosinthecrd <tom@tmlabs.co.uk>
This commit is contained in:
@@ -137,7 +137,9 @@ import (
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
"tailscale.com/client/tailscale"
|
||||
"tailscale.com/health"
|
||||
"tailscale.com/ipn"
|
||||
"tailscale.com/ipn/conffile"
|
||||
kubeutils "tailscale.com/k8s-operator"
|
||||
healthz "tailscale.com/kube/health"
|
||||
"tailscale.com/kube/kubetypes"
|
||||
@@ -206,6 +208,11 @@ func run() error {
|
||||
bootCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var tailscaledConfigAuthkey string
|
||||
if isOneStepConfig(cfg) {
|
||||
tailscaledConfigAuthkey = authkeyFromTailscaledConfig(cfg.TailscaledConfigFilePath)
|
||||
}
|
||||
|
||||
var kc *kubeClient
|
||||
if cfg.KubeSecret != "" {
|
||||
kc, err = newKubeClient(cfg.Root, cfg.KubeSecret)
|
||||
@@ -219,7 +226,7 @@ func run() error {
|
||||
// hasKubeStateStore because although we know we're in kube, that
|
||||
// doesn't guarantee the state store is properly configured.
|
||||
if hasKubeStateStore(cfg) {
|
||||
if err := kc.resetContainerbootState(bootCtx, cfg.PodUID); err != nil {
|
||||
if err := kc.resetContainerbootState(bootCtx, cfg.PodUID, tailscaledConfigAuthkey); err != nil {
|
||||
return fmt.Errorf("error clearing previous state from Secret: %w", err)
|
||||
}
|
||||
}
|
||||
@@ -299,7 +306,7 @@ func run() error {
|
||||
}
|
||||
}
|
||||
|
||||
w, err := client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialPrefs|ipn.NotifyInitialState)
|
||||
w, err := client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialPrefs|ipn.NotifyInitialState|ipn.NotifyInitialHealthState)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to watch tailscaled for updates: %w", err)
|
||||
}
|
||||
@@ -365,8 +372,23 @@ authLoop:
|
||||
if isOneStepConfig(cfg) {
|
||||
// This could happen if this is the first time tailscaled was run for this
|
||||
// device and the auth key was not passed via the configfile.
|
||||
return fmt.Errorf("invalid state: tailscaled daemon started with a config file, but tailscale is not logged in: ensure you pass a valid auth key in the config file.")
|
||||
if hasKubeStateStore(cfg) {
|
||||
log.Printf("Auth key missing or invalid (NeedsLogin state), disconnecting from control and requesting new key from operator")
|
||||
|
||||
err := kc.setAndWaitForAuthKeyReissue(bootCtx, client, cfg, tailscaledConfigAuthkey)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get a reissued authkey: %w", err)
|
||||
}
|
||||
|
||||
log.Printf("Successfully received new auth key, restarting to apply configuration")
|
||||
|
||||
// we don't return an error here since we have handled the reissue gracefully.
|
||||
return nil
|
||||
}
|
||||
|
||||
return errors.New("invalid state: tailscaled daemon started with a config file, but tailscale is not logged in: ensure you pass a valid auth key in the config file")
|
||||
}
|
||||
|
||||
if err := authTailscale(); err != nil {
|
||||
return fmt.Errorf("failed to auth tailscale: %w", err)
|
||||
}
|
||||
@@ -384,6 +406,27 @@ authLoop:
|
||||
log.Printf("tailscaled in state %q, waiting", *n.State)
|
||||
}
|
||||
}
|
||||
|
||||
if n.Health != nil {
|
||||
// This can happen if the config has an auth key but it's invalid,
|
||||
// for example if it was single-use and already got used, but the
|
||||
// device state was lost.
|
||||
if _, ok := n.Health.Warnings[health.LoginStateWarnable.Code]; ok {
|
||||
if isOneStepConfig(cfg) && hasKubeStateStore(cfg) {
|
||||
log.Printf("Auth key failed to authenticate (may be expired or single-use), disconnecting from control and requesting new key from operator")
|
||||
|
||||
err := kc.setAndWaitForAuthKeyReissue(bootCtx, client, cfg, tailscaledConfigAuthkey)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get a reissued authkey: %w", err)
|
||||
}
|
||||
|
||||
// we don't return an error here since we have handled the reissue gracefully.
|
||||
log.Printf("Successfully received new auth key, restarting to apply configuration")
|
||||
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
w.Close()
|
||||
@@ -409,9 +452,9 @@ authLoop:
|
||||
// We were told to only auth once, so any secret-bound
|
||||
// authkey is no longer needed. We don't strictly need to
|
||||
// wipe it, but it's good hygiene.
|
||||
log.Printf("Deleting authkey from kube secret")
|
||||
log.Printf("Deleting authkey from Kubernetes Secret")
|
||||
if err := kc.deleteAuthKey(ctx); err != nil {
|
||||
return fmt.Errorf("deleting authkey from kube secret: %w", err)
|
||||
return fmt.Errorf("deleting authkey from Kubernetes Secret: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -422,8 +465,10 @@ authLoop:
|
||||
|
||||
// If tailscaled config was read from a mounted file, watch the file for updates and reload.
|
||||
cfgWatchErrChan := make(chan error)
|
||||
cfgWatchCtx, cfgWatchCancel := context.WithCancel(ctx)
|
||||
defer cfgWatchCancel()
|
||||
if cfg.TailscaledConfigFilePath != "" {
|
||||
go watchTailscaledConfigChanges(ctx, cfg.TailscaledConfigFilePath, client, cfgWatchErrChan)
|
||||
go watchTailscaledConfigChanges(cfgWatchCtx, cfg.TailscaledConfigFilePath, client, cfgWatchErrChan)
|
||||
}
|
||||
|
||||
var (
|
||||
@@ -523,6 +568,7 @@ runLoop:
|
||||
case err := <-cfgWatchErrChan:
|
||||
return fmt.Errorf("failed to watch tailscaled config: %w", err)
|
||||
case n := <-notifyChan:
|
||||
// TODO: (ChaosInTheCRD) Add node removed check when supported by ipn
|
||||
if n.State != nil && *n.State != ipn.Running {
|
||||
// Something's gone wrong and we've left the authenticated state.
|
||||
// Our container image never recovered gracefully from this, and the
|
||||
@@ -979,3 +1025,11 @@ func serviceIPsFromNetMap(nm *netmap.NetworkMap, fqdn dnsname.FQDN) []netip.Pref
|
||||
|
||||
return prefixes
|
||||
}
|
||||
|
||||
func authkeyFromTailscaledConfig(path string) string {
|
||||
if cfg, err := conffile.Load(path); err == nil && cfg.Parsed.AuthKey != nil {
|
||||
return *cfg.Parsed.AuthKey
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user