cmd/{containerboot,k8s-operator}: use state Secret for checking device auth (#16328)

Previously, the operator checked the ProxyGroup status fields for
information on how many of the proxies had successfully authed. Use
their state Secrets instead as a more reliable source of truth.

containerboot has written device_fqdn and device_ips keys to the
state Secret since inception, and pod_uid since 1.78.0, so there's
no need to use the API for that data. Read it from the state Secret
for consistency. However, to ensure we don't read data from a
previous run of containerboot, make sure we reset containerboot's
state keys on startup.

One other knock-on effect of that is ProxyGroups can briefly be
marked not Ready while a Pod is restarting. Introduce a new
ProxyGroupAvailable condition to more accurately reflect
when downstream controllers can implement flows that rely on a
ProxyGroup having at least 1 proxy Pod running.

Fixes #16327

Change-Id: I026c18e9d23e87109a471a87b8e4fb6271716a66

Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
This commit is contained in:
Tom Proctor
2025-06-27 18:10:04 +01:00
committed by GitHub
parent f81baa2d56
commit 711698f5a9
19 changed files with 373 additions and 202 deletions
+32 -10
View File
@@ -18,12 +18,15 @@ import (
"time"
"tailscale.com/ipn"
"tailscale.com/kube/egressservices"
"tailscale.com/kube/ingressservices"
"tailscale.com/kube/kubeapi"
"tailscale.com/kube/kubeclient"
"tailscale.com/kube/kubetypes"
"tailscale.com/logtail/backoff"
"tailscale.com/tailcfg"
"tailscale.com/types/logger"
"tailscale.com/util/set"
)
// kubeClient is a wrapper around Tailscale's internal kube client that knows how to talk to the kube API server. We use
@@ -117,20 +120,39 @@ func (kc *kubeClient) deleteAuthKey(ctx context.Context) error {
return nil
}
// storeCapVerUID stores the current capability version of tailscale and, if provided, UID of the Pod in the tailscale
// state Secret.
// These two fields are used by the Kubernetes Operator to observe the current capability version of tailscaled running in this container.
func (kc *kubeClient) storeCapVerUID(ctx context.Context, podUID string) error {
capVerS := fmt.Sprintf("%d", tailcfg.CurrentCapabilityVersion)
d := map[string][]byte{
kubetypes.KeyCapVer: []byte(capVerS),
// resetContainerbootState resets state from previous runs of containerboot to
// ensure the operator doesn't use stale state when a Pod is first recreated.
func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string) error {
existingSecret, err := kc.GetSecret(ctx, kc.stateSecret)
if err != nil {
return fmt.Errorf("failed to read state Secret %q to reset state: %w", kc.stateSecret, err)
}
s := &kubeapi.Secret{
Data: map[string][]byte{
kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion),
},
}
if podUID != "" {
d[kubetypes.KeyPodUID] = []byte(podUID)
s.Data[kubetypes.KeyPodUID] = []byte(podUID)
}
s := &kubeapi.Secret{
Data: d,
toClear := set.SetOf([]string{
kubetypes.KeyDeviceID,
kubetypes.KeyDeviceFQDN,
kubetypes.KeyDeviceIPs,
kubetypes.KeyHTTPSEndpoint,
egressservices.KeyEgressServices,
ingressservices.IngressConfigKey,
})
for key := range existingSecret.Data {
if toClear.Contains(key) {
// It's fine to leave the key in place as a debugging breadcrumb,
// it should get a new value soon.
s.Data[key] = nil
}
}
return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, "tailscale-container")
}
+80
View File
@@ -8,13 +8,18 @@ package main
import (
"context"
"errors"
"fmt"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"tailscale.com/ipn"
"tailscale.com/kube/egressservices"
"tailscale.com/kube/ingressservices"
"tailscale.com/kube/kubeapi"
"tailscale.com/kube/kubeclient"
"tailscale.com/kube/kubetypes"
"tailscale.com/tailcfg"
)
func TestSetupKube(t *testing.T) {
@@ -238,3 +243,78 @@ func TestWaitForConsistentState(t *testing.T) {
t.Fatalf("expected nil, got %v", err)
}
}
func TestResetContainerbootState(t *testing.T) {
capver := fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion)
for name, tc := range map[string]struct {
podUID string
initial map[string][]byte
expected map[string][]byte
}{
"empty_initial": {
podUID: "1234",
initial: map[string][]byte{},
expected: map[string][]byte{
kubetypes.KeyCapVer: capver,
kubetypes.KeyPodUID: []byte("1234"),
},
},
"empty_initial_no_pod_uid": {
initial: map[string][]byte{},
expected: map[string][]byte{
kubetypes.KeyCapVer: capver,
},
},
"only_relevant_keys_updated": {
podUID: "1234",
initial: map[string][]byte{
kubetypes.KeyCapVer: []byte("1"),
kubetypes.KeyPodUID: []byte("5678"),
kubetypes.KeyDeviceID: []byte("device-id"),
kubetypes.KeyDeviceFQDN: []byte("device-fqdn"),
kubetypes.KeyDeviceIPs: []byte(`["192.0.2.1"]`),
kubetypes.KeyHTTPSEndpoint: []byte("https://example.com"),
egressservices.KeyEgressServices: []byte("egress-services"),
ingressservices.IngressConfigKey: []byte("ingress-config"),
"_current-profile": []byte("current-profile"),
"_machinekey": []byte("machine-key"),
"_profiles": []byte("profiles"),
"_serve_e0ce": []byte("serve-e0ce"),
"profile-e0ce": []byte("profile-e0ce"),
},
expected: map[string][]byte{
kubetypes.KeyCapVer: capver,
kubetypes.KeyPodUID: []byte("1234"),
// Cleared keys.
kubetypes.KeyDeviceID: nil,
kubetypes.KeyDeviceFQDN: nil,
kubetypes.KeyDeviceIPs: nil,
kubetypes.KeyHTTPSEndpoint: nil,
egressservices.KeyEgressServices: nil,
ingressservices.IngressConfigKey: nil,
// Tailscaled keys not included in patch.
},
},
} {
t.Run(name, func(t *testing.T) {
var actual map[string][]byte
kc := &kubeClient{stateSecret: "foo", Client: &kubeclient.FakeClient{
GetSecretImpl: func(context.Context, string) (*kubeapi.Secret, error) {
return &kubeapi.Secret{
Data: tc.initial,
}, nil
},
StrategicMergePatchSecretImpl: func(ctx context.Context, name string, secret *kubeapi.Secret, _ string) error {
actual = secret.Data
return nil
},
}}
if err := kc.resetContainerbootState(context.Background(), tc.podUID); err != nil {
t.Fatalf("resetContainerbootState() error = %v", err)
}
if diff := cmp.Diff(tc.expected, actual); diff != "" {
t.Errorf("resetContainerbootState() mismatch (-want +got):\n%s", diff)
}
})
}
}
+8 -11
View File
@@ -188,6 +188,14 @@ func run() error {
if err := cfg.setupKube(bootCtx, kc); err != nil {
return fmt.Errorf("error setting up for running on Kubernetes: %w", err)
}
// Clear out any state from previous runs of containerboot. Check
// hasKubeStateStore because although we know we're in kube, that
// doesn't guarantee the state store is properly configured.
if hasKubeStateStore(cfg) {
if err := kc.resetContainerbootState(bootCtx, cfg.PodUID); err != nil {
return fmt.Errorf("error clearing previous state from Secret: %w", err)
}
}
}
client, daemonProcess, err := startTailscaled(bootCtx, cfg)
@@ -367,11 +375,6 @@ authLoop:
if err := client.SetServeConfig(ctx, new(ipn.ServeConfig)); err != nil {
return fmt.Errorf("failed to unset serve config: %w", err)
}
if hasKubeStateStore(cfg) {
if err := kc.storeHTTPSEndpoint(ctx, ""); err != nil {
return fmt.Errorf("failed to update HTTPS endpoint in tailscale state: %w", err)
}
}
}
if hasKubeStateStore(cfg) && isTwoStepConfigAuthOnce(cfg) {
@@ -384,12 +387,6 @@ authLoop:
}
}
if hasKubeStateStore(cfg) {
if err := kc.storeCapVerUID(ctx, cfg.PodUID); err != nil {
return fmt.Errorf("storing capability version and UID: %w", err)
}
}
w, err = client.WatchIPNBus(ctx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState)
if err != nil {
return fmt.Errorf("rewatching tailscaled for updates after auth: %w", err)
+69 -63
View File
@@ -460,6 +460,7 @@ func TestContainerBoot(t *testing.T) {
Env: map[string]string{
"KUBERNETES_SERVICE_HOST": env.kube.Host,
"KUBERNETES_SERVICE_PORT_HTTPS": env.kube.Port,
"POD_UID": "some-pod-uid",
},
KubeSecret: map[string]string{
"authkey": "tskey-key",
@@ -471,17 +472,20 @@ func TestContainerBoot(t *testing.T) {
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key",
},
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"authkey": "tskey-key",
kubetypes.KeyCapVer: capver,
kubetypes.KeyPodUID: "some-pod-uid",
},
},
{
Notify: runningNotify,
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
"tailscale_capver": capver,
"authkey": "tskey-key",
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
kubetypes.KeyCapVer: capver,
kubetypes.KeyPodUID: "some-pod-uid",
},
},
},
@@ -554,7 +558,8 @@ func TestContainerBoot(t *testing.T) {
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking",
},
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"authkey": "tskey-key",
kubetypes.KeyCapVer: capver,
},
},
{
@@ -565,7 +570,8 @@ func TestContainerBoot(t *testing.T) {
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key",
},
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"authkey": "tskey-key",
kubetypes.KeyCapVer: capver,
},
},
{
@@ -574,10 +580,10 @@ func TestContainerBoot(t *testing.T) {
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock set --accept-dns=false",
},
WantKubeSecret: map[string]string{
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
"tailscale_capver": capver,
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
kubetypes.KeyCapVer: capver,
},
},
},
@@ -599,17 +605,18 @@ func TestContainerBoot(t *testing.T) {
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key",
},
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"authkey": "tskey-key",
kubetypes.KeyCapVer: capver,
},
},
{
Notify: runningNotify,
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
"tailscale_capver": capver,
"authkey": "tskey-key",
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
kubetypes.KeyCapVer: capver,
},
},
{
@@ -624,11 +631,11 @@ func TestContainerBoot(t *testing.T) {
},
},
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"device_fqdn": "new-name.test.ts.net",
"device_id": "newID",
"device_ips": `["100.64.0.1"]`,
"tailscale_capver": capver,
"authkey": "tskey-key",
"device_fqdn": "new-name.test.ts.net",
"device_id": "newID",
"device_ips": `["100.64.0.1"]`,
kubetypes.KeyCapVer: capver,
},
},
},
@@ -912,18 +919,19 @@ func TestContainerBoot(t *testing.T) {
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key",
},
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"authkey": "tskey-key",
kubetypes.KeyCapVer: capver,
},
},
{
Notify: runningNotify,
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
"https_endpoint": "no-https",
"tailscale_capver": capver,
"authkey": "tskey-key",
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
"https_endpoint": "no-https",
kubetypes.KeyCapVer: capver,
},
},
},
@@ -947,7 +955,8 @@ func TestContainerBoot(t *testing.T) {
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key",
},
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"authkey": "tskey-key",
kubetypes.KeyCapVer: capver,
},
EndpointStatuses: map[string]int{
egressSvcTerminateURL(env.localAddrPort): 200,
@@ -956,12 +965,12 @@ func TestContainerBoot(t *testing.T) {
{
Notify: runningNotify,
WantKubeSecret: map[string]string{
"egress-services": mustBase64(t, egressStatus),
"authkey": "tskey-key",
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
"tailscale_capver": capver,
"egress-services": string(mustJSON(t, egressStatus)),
"authkey": "tskey-key",
"device_fqdn": "test-node.test.ts.net",
"device_id": "myID",
"device_ips": `["100.64.0.1"]`,
kubetypes.KeyCapVer: capver,
},
EndpointStatuses: map[string]int{
egressSvcTerminateURL(env.localAddrPort): 200,
@@ -1002,7 +1011,8 @@ func TestContainerBoot(t *testing.T) {
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key",
},
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"authkey": "tskey-key",
kubetypes.KeyCapVer: capver,
},
},
{
@@ -1016,10 +1026,11 @@ func TestContainerBoot(t *testing.T) {
// Missing "_current-profile" key.
},
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"_machinekey": "foo",
"_profiles": "foo",
"profile-baff": "foo",
"authkey": "tskey-key",
"_machinekey": "foo",
"_profiles": "foo",
"profile-baff": "foo",
kubetypes.KeyCapVer: capver,
},
WantLog: "Waiting for tailscaled to finish writing state to Secret \"tailscale\"",
},
@@ -1029,11 +1040,12 @@ func TestContainerBoot(t *testing.T) {
"_current-profile": "foo",
},
WantKubeSecret: map[string]string{
"authkey": "tskey-key",
"_machinekey": "foo",
"_profiles": "foo",
"profile-baff": "foo",
"_current-profile": "foo",
"authkey": "tskey-key",
"_machinekey": "foo",
"_profiles": "foo",
"profile-baff": "foo",
"_current-profile": "foo",
kubetypes.KeyCapVer: capver,
},
WantLog: "HTTP server at [::]:9002 closed",
WantExitCode: ptr.To(0),
@@ -1061,7 +1073,7 @@ func TestContainerBoot(t *testing.T) {
fmt.Sprintf("TS_TEST_SOCKET=%s", env.lapi.Path),
fmt.Sprintf("TS_SOCKET=%s", env.runningSockPath),
fmt.Sprintf("TS_TEST_ONLY_ROOT=%s", env.d),
fmt.Sprint("TS_TEST_FAKE_NETFILTER=true"),
"TS_TEST_FAKE_NETFILTER=true",
}
for k, v := range tc.Env {
cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
@@ -1489,10 +1501,7 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
}
switch r.Header.Get("Content-Type") {
case "application/json-patch+json":
req := []struct {
Op string `json:"op"`
Path string `json:"path"`
}{}
req := []kubeclient.JSONPatch{}
if err := json.Unmarshal(bs, &req); err != nil {
panic(fmt.Sprintf("json decode failed: %v. Body:\n\n%s", err, string(bs)))
}
@@ -1503,23 +1512,20 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
panic(fmt.Sprintf("unsupported json-patch path %q", op.Path))
}
delete(k.secret, strings.TrimPrefix(op.Path, "/data/"))
case "replace":
case "add", "replace":
path, ok := strings.CutPrefix(op.Path, "/data/")
if !ok {
panic(fmt.Sprintf("unsupported json-patch path %q", op.Path))
}
req := make([]kubeclient.JSONPatch, 0)
if err := json.Unmarshal(bs, &req); err != nil {
panic(fmt.Sprintf("json decode failed: %v. Body:\n\n%s", err, string(bs)))
val, ok := op.Value.(string)
if !ok {
panic(fmt.Sprintf("unsupported json patch value %v: cannot be converted to string", op.Value))
}
for _, patch := range req {
val, ok := patch.Value.(string)
if !ok {
panic(fmt.Sprintf("unsupported json patch value %v: cannot be converted to string", patch.Value))
}
k.secret[path] = val
v, err := base64.StdEncoding.DecodeString(val)
if err != nil {
panic(fmt.Sprintf("json patch value %q is not base64 encoded: %v", val, err))
}
k.secret[path] = string(v)
default:
panic(fmt.Sprintf("unsupported json-patch op %q", op.Op))
}