cmd/{containerboot,k8s-operator}: reissue auth keys for broken proxies (#16450)

Adds logic for containerboot to signal that it can't auth, so the
operator can reissue a new auth key. This only applies when running with
a config file and with a kube state store.

If the operator sees reissue_authkey in a state Secret, it will create a
new auth key iff the config has no auth key or its auth key matches the
value of reissue_authkey from the state Secret. This is to ensure we
don't reissue auth keys in a tight loop if the proxy is slow to start or
failing for some other reason. The reissue logic also uses a burstable
rate limiter to ensure there's no way a terminally misconfigured
or buggy operator can automatically generate new auth keys in a tight loop.

Additional implementation details (ChaosInTheCRD):

- Added `ipn.NotifyInitialHealthState` to ipn watcher, to ensure that
  `n.Health` is populated when notifies are returned.
- on auth failure, containerboot:
  - Disconnects from control server
  - Sets reissue_authkey marker in state Secret with the failing key
  - Polls config file for new auth key (10 minute timeout)
  - Restarts after receiving new key to apply it

- Modified operator's reissue logic slightly:
  - Deletes old device from tailnet before creating new key
  - Rate limiting: 1 key per 30s with initial burst equal to replica count
  - In-flight tracking (authKeyReissuing map) prevents duplicate API calls
    across reconcile loops

Updates #14080

Change-Id: I6982f8e741932a6891f2f48a2936f7f6a455317f


(cherry picked from commit 969927c47c3d4de05e90f5b26a6d8d931c5ceed4)

Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
Co-authored-by: chaosinthecrd <tom@tmlabs.co.uk>
This commit is contained in:
Tom Proctor
2026-03-11 10:25:57 +00:00
committed by GitHub
parent 7a43e41a27
commit 95a135ead1
11 changed files with 875 additions and 156 deletions
+174 -17
View File
@@ -32,6 +32,7 @@ import (
"github.com/google/go-cmp/cmp"
"golang.org/x/sys/unix"
"tailscale.com/health"
"tailscale.com/ipn"
"tailscale.com/kube/egressservices"
"tailscale.com/kube/kubeclient"
@@ -41,6 +42,8 @@ import (
"tailscale.com/types/netmap"
)
const configFileAuthKey = "some-auth-key"
func TestContainerBoot(t *testing.T) {
boot := filepath.Join(t.TempDir(), "containerboot")
if err := exec.Command("go", "build", "-ldflags", "-X main.testSleepDuration=1ms", "-o", boot, "tailscale.com/cmd/containerboot").Run(); err != nil {
@@ -77,6 +80,10 @@ func TestContainerBoot(t *testing.T) {
// phase (simulates our fake tailscaled doing it).
UpdateKubeSecret map[string]string
// Update files with these paths/contents at the beginning of the phase
// (simulates the operator updating mounted config files).
UpdateFiles map[string]string
// WantFiles files that should exist in the container and their
// contents.
WantFiles map[string]string
@@ -781,6 +788,127 @@ func TestContainerBoot(t *testing.T) {
},
}
},
"sets_reissue_authkey_if_needs_login": func(env *testEnv) testCase {
newAuthKey := "new-reissued-auth-key"
return testCase{
Env: map[string]string{
"TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR": filepath.Join(env.d, "etc/tailscaled/"),
"KUBERNETES_SERVICE_HOST": env.kube.Host,
"KUBERNETES_SERVICE_PORT_HTTPS": env.kube.Port,
},
Phases: []phase{
{
UpdateFiles: map[string]string{
"etc/tailscaled/..data": "",
},
WantCmds: []string{
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking --config=/etc/tailscaled/cap-95.hujson",
},
WantKubeSecret: map[string]string{
kubetypes.KeyCapVer: capver,
},
}, {
Notify: &ipn.Notify{
State: new(ipn.NeedsLogin),
},
WantKubeSecret: map[string]string{
kubetypes.KeyCapVer: capver,
kubetypes.KeyReissueAuthkey: configFileAuthKey,
},
WantLog: "watching for config changes via fsnotify",
}, {
UpdateFiles: map[string]string{
"etc/tailscaled/cap-95.hujson": fmt.Sprintf(`{"Version":"alpha0","AuthKey":"%s"}`, newAuthKey),
"etc/tailscaled/..data": "updated",
},
WantKubeSecret: map[string]string{
kubetypes.KeyCapVer: capver,
},
WantExitCode: new(0),
WantLog: "Successfully received new auth key, restarting to apply configuration",
},
},
}
},
"sets_reissue_authkey_if_auth_fails": func(env *testEnv) testCase {
newAuthKey := "new-reissued-auth-key"
return testCase{
Env: map[string]string{
"TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR": filepath.Join(env.d, "etc/tailscaled/"),
"KUBERNETES_SERVICE_HOST": env.kube.Host,
"KUBERNETES_SERVICE_PORT_HTTPS": env.kube.Port,
},
Phases: []phase{
{
UpdateFiles: map[string]string{
"etc/tailscaled/..data": "",
},
WantCmds: []string{
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking --config=/etc/tailscaled/cap-95.hujson",
},
WantKubeSecret: map[string]string{
kubetypes.KeyCapVer: capver,
},
}, {
Notify: &ipn.Notify{
Health: &health.State{
Warnings: map[health.WarnableCode]health.UnhealthyState{
health.LoginStateWarnable.Code: {},
},
},
},
WantKubeSecret: map[string]string{
kubetypes.KeyCapVer: capver,
kubetypes.KeyReissueAuthkey: configFileAuthKey,
},
WantLog: "watching for config changes via fsnotify",
}, {
UpdateFiles: map[string]string{
"etc/tailscaled/cap-95.hujson": fmt.Sprintf(`{"Version":"alpha0","AuthKey":"%s"}`, newAuthKey),
"etc/tailscaled/..data": "updated",
},
WantKubeSecret: map[string]string{
kubetypes.KeyCapVer: capver,
},
WantExitCode: new(0),
WantLog: "Successfully received new auth key, restarting to apply configuration",
},
},
}
},
"clears_reissue_authkey_on_change": func(env *testEnv) testCase {
return testCase{
Env: map[string]string{
"TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR": filepath.Join(env.d, "etc/tailscaled/"),
"KUBERNETES_SERVICE_HOST": env.kube.Host,
"KUBERNETES_SERVICE_PORT_HTTPS": env.kube.Port,
},
KubeSecret: map[string]string{
kubetypes.KeyReissueAuthkey: "some-older-authkey",
"foo": "bar", // Check not everything is cleared.
},
Phases: []phase{
{
WantCmds: []string{
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking --config=/etc/tailscaled/cap-95.hujson",
},
WantKubeSecret: map[string]string{
kubetypes.KeyCapVer: capver,
"foo": "bar",
},
}, {
Notify: runningNotify,
WantKubeSecret: map[string]string{
kubetypes.KeyCapVer: capver,
"foo": "bar",
kubetypes.KeyDeviceFQDN: "test-node.test.ts.net.",
kubetypes.KeyDeviceID: "myID",
kubetypes.KeyDeviceIPs: `["100.64.0.1"]`,
},
},
},
}
},
"metrics_enabled": func(env *testEnv) testCase {
return testCase{
Env: map[string]string{
@@ -1134,19 +1262,22 @@ func TestContainerBoot(t *testing.T) {
for k, v := range p.UpdateKubeSecret {
env.kube.SetSecret(k, v)
}
for path, content := range p.UpdateFiles {
fullPath := filepath.Join(env.d, path)
if err := os.WriteFile(fullPath, []byte(content), 0700); err != nil {
t.Fatalf("phase %d: updating file %q: %v", i, path, err)
}
// Explicitly update mtime to ensure fsnotify detects the change.
// Without this, file operations can be buffered and fsnotify events may not trigger.
now := time.Now()
if err := os.Chtimes(fullPath, now, now); err != nil {
t.Fatalf("phase %d: updating mtime for %q: %v", i, path, err)
}
}
env.lapi.Notify(p.Notify)
if p.Signal != nil {
cmd.Process.Signal(*p.Signal)
}
if p.WantLog != "" {
err := tstest.WaitFor(2*time.Second, func() error {
waitLogLine(t, time.Second, cbOut, p.WantLog)
return nil
})
if err != nil {
t.Fatal(err)
}
}
if p.WantExitCode != nil {
state, err := cmd.Process.Wait()
@@ -1156,14 +1287,19 @@ func TestContainerBoot(t *testing.T) {
if state.ExitCode() != *p.WantExitCode {
t.Fatalf("phase %d: want exit code %d, got %d", i, *p.WantExitCode, state.ExitCode())
}
// Early test return, we don't expect the successful startup log message.
return
}
wantCmds = append(wantCmds, p.WantCmds...)
waitArgs(t, 2*time.Second, env.d, env.argFile, strings.Join(wantCmds, "\n"))
err := tstest.WaitFor(2*time.Second, func() error {
if p.WantLog != "" {
err := tstest.WaitFor(5*time.Second, func() error {
waitLogLine(t, 5*time.Second, cbOut, p.WantLog)
return nil
})
if err != nil {
t.Fatal(err)
}
}
err := tstest.WaitFor(5*time.Second, func() error {
if p.WantKubeSecret != nil {
got := env.kube.Secret()
if diff := cmp.Diff(got, p.WantKubeSecret); diff != "" {
@@ -1180,6 +1316,16 @@ func TestContainerBoot(t *testing.T) {
if err != nil {
t.Fatalf("test: %q phase %d: %v", name, i, err)
}
// if we provide a wanted exit code, we expect that the process is finished,
// so should return from the test.
if p.WantExitCode != nil {
return
}
wantCmds = append(wantCmds, p.WantCmds...)
waitArgs(t, 2*time.Second, env.d, env.argFile, strings.Join(wantCmds, "\n"))
err = tstest.WaitFor(2*time.Second, func() error {
for path, want := range p.WantFiles {
gotBs, err := os.ReadFile(filepath.Join(env.d, path))
@@ -1393,6 +1539,13 @@ func (lc *localAPI) ServeHTTP(w http.ResponseWriter, r *http.Request) {
default:
panic(fmt.Sprintf("unsupported method %q", r.Method))
}
// In the localAPI ServeHTTP method
case "/localapi/v0/disconnect-control":
if r.Method != "POST" {
panic(fmt.Sprintf("unsupported method %q", r.Method))
}
w.WriteHeader(http.StatusOK)
return
default:
panic(fmt.Sprintf("unsupported path %q", r.URL.Path))
}
@@ -1591,7 +1744,11 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
panic(fmt.Sprintf("json decode failed: %v. Body:\n\n%s", err, string(bs)))
}
for key, val := range req.Data {
k.secret[key] = string(val)
if val == nil {
delete(k.secret, key)
} else {
k.secret[key] = string(val)
}
}
default:
panic(fmt.Sprintf("unknown content type %q", r.Header.Get("Content-Type")))
@@ -1659,7 +1816,7 @@ func newTestEnv(t *testing.T) testEnv {
kube.Start(t)
t.Cleanup(kube.Close)
tailscaledConf := &ipn.ConfigVAlpha{AuthKey: new("foo"), Version: "alpha0"}
tailscaledConf := &ipn.ConfigVAlpha{AuthKey: new(configFileAuthKey), Version: "alpha0"}
serveConf := ipn.ServeConfig{TCP: map[uint16]*ipn.TCPPortHandler{80: {HTTP: true}}}
serveConfWithServices := ipn.ServeConfig{
TCP: map[uint16]*ipn.TCPPortHandler{80: {HTTP: true}},