cmd/k8s-operator: migrate to tailscale-client-go-v2 (#19010)

This commit modifies the Kubernetes operator to use the `tailscale-client-go-v2`
package instead of the internal Tailscale client it was previously using. This
allows us to expand our custom resources and features as they become available
via the API module.

The tailnet reconciler has also been modified to manage clients as tailnets
are created and removed, providing each subsequent reconciler with a single
`ClientProvider` that obtains a Tailscale client for the respective tailnet
by name, or the operator's default client when given an empty name.

Fixes: https://github.com/tailscale/corp/issues/38418

Signed-off-by: David Bond <davidsbond93@gmail.com>
This commit is contained in:
David Bond
2026-04-09 14:39:46 +01:00
committed by GitHub
parent b25920dfc0
commit 85d6ba9473
33 changed files with 916 additions and 940 deletions
+33 -58
View File
@@ -10,7 +10,6 @@ import (
"encoding/json"
"errors"
"fmt"
"net/http"
"net/netip"
"slices"
"sort"
@@ -33,11 +32,12 @@ import (
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"tailscale.com/client/tailscale/v2"
"tailscale.com/client/tailscale"
"tailscale.com/ipn"
tsoperator "tailscale.com/k8s-operator"
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
"tailscale.com/k8s-operator/tsclient"
"tailscale.com/kube/egressservices"
"tailscale.com/kube/k8s-proxy/conf"
"tailscale.com/kube/kubetypes"
@@ -85,7 +85,7 @@ type ProxyGroupReconciler struct {
log *zap.SugaredLogger
recorder record.EventRecorder
clock tstime.Clock
tsClient tsClient
clients ClientProvider
// User-specified defaults from the helm installation.
tsNamespace string
@@ -122,7 +122,7 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ
return reconcile.Result{}, fmt.Errorf("failed to get tailscale.com ProxyGroup: %w", err)
}
tailscaleClient, loginUrl, err := r.getClientAndLoginURL(ctx, pg.Spec.Tailnet)
tsClient, err := r.clients.For(pg.Spec.Tailnet)
if err != nil {
oldPGStatus := pg.Status.DeepCopy()
nrr := &notReadyReason{
@@ -141,7 +141,7 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ
return reconcile.Result{}, nil
}
if done, err := r.maybeCleanup(ctx, tailscaleClient, pg); err != nil {
if done, err := r.maybeCleanup(ctx, tsClient, pg); err != nil {
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
logger.Infof("optimistic lock error, retrying: %s", err)
return reconcile.Result{}, nil
@@ -160,7 +160,7 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ
}
oldPGStatus := pg.Status.DeepCopy()
staticEndpoints, nrr, err := r.reconcilePG(ctx, tailscaleClient, loginUrl, pg, logger)
staticEndpoints, nrr, err := r.reconcilePG(ctx, tsClient, pg, logger)
return reconcile.Result{}, errors.Join(err, r.maybeUpdateStatus(ctx, logger, pg, oldPGStatus, nrr, staticEndpoints))
}
@@ -168,7 +168,7 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ
// for deletion. It is separated out from Reconcile to make a clear separation
// between reconciling the ProxyGroup, and posting the status of its created
// resources onto the ProxyGroup status field.
func (r *ProxyGroupReconciler) reconcilePG(ctx context.Context, tailscaleClient tsClient, loginUrl string, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger) (map[string][]netip.AddrPort, *notReadyReason, error) {
func (r *ProxyGroupReconciler) reconcilePG(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup, logger *zap.SugaredLogger) (map[string][]netip.AddrPort, *notReadyReason, error) {
if !slices.Contains(pg.Finalizers, FinalizerName) {
// This log line is printed exactly once during initial provisioning,
// because once the finalizer is in place this block gets skipped. So,
@@ -209,7 +209,7 @@ func (r *ProxyGroupReconciler) reconcilePG(ctx context.Context, tailscaleClient
return notReady(reasonProxyGroupInvalid, fmt.Sprintf("invalid ProxyGroup spec: %v", err))
}
staticEndpoints, nrr, err := r.maybeProvision(ctx, tailscaleClient, loginUrl, pg, proxyClass)
staticEndpoints, nrr, err := r.maybeProvision(ctx, tsClient, pg, proxyClass)
if err != nil {
return nil, nrr, err
}
@@ -295,7 +295,7 @@ func (r *ProxyGroupReconciler) validate(ctx context.Context, pg *tsapi.ProxyGrou
return errors.Join(errs...)
}
func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, tailscaleClient tsClient, loginUrl string, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (map[string][]netip.AddrPort, *notReadyReason, error) {
func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (map[string][]netip.AddrPort, *notReadyReason, error) {
logger := r.logger(pg.Name)
r.mu.Lock()
r.ensureStateAddedForProxyGroup(pg)
@@ -317,7 +317,7 @@ func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, tailscaleClie
}
}
staticEndpoints, err := r.ensureConfigSecretsCreated(ctx, tailscaleClient, loginUrl, pg, proxyClass, svcToNodePorts)
staticEndpoints, err := r.ensureConfigSecretsCreated(ctx, tsClient, pg, proxyClass, svcToNodePorts)
if err != nil {
if _, ok := errors.AsType[*FindStaticEndpointErr](err); ok {
reason := reasonProxyGroupCreationFailed
@@ -428,7 +428,7 @@ func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, tailscaleClie
return r.notReadyErrf(pg, logger, "error reconciling metrics resources: %w", err)
}
if err := r.cleanupDanglingResources(ctx, tailscaleClient, pg, proxyClass); err != nil {
if err := r.cleanupDanglingResources(ctx, tsClient, pg, proxyClass); err != nil {
return r.notReadyErrf(pg, logger, "error cleaning up dangling resources: %w", err)
}
@@ -625,7 +625,7 @@ func (r *ProxyGroupReconciler) ensureNodePortServiceCreated(ctx context.Context,
// cleanupDanglingResources ensures we don't leak config secrets, state secrets, and
// tailnet devices when the number of replicas specified is reduced.
func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, tailscaleClient tsClient, pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass) error {
func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass) error {
logger := r.logger(pg.Name)
metadata, err := getNodeMetadata(ctx, pg, r.Client, r.tsNamespace)
if err != nil {
@@ -639,7 +639,7 @@ func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, tai
// Dangling resource, delete the config + state Secrets, as well as
// deleting the device from the tailnet.
if err := r.ensureDeviceDeleted(ctx, tailscaleClient, m.tsID, logger); err != nil {
if err := r.ensureDeviceDeleted(ctx, tsClient, m.tsID, logger); err != nil {
return err
}
if err := r.Delete(ctx, m.stateSecret); err != nil && !apierrors.IsNotFound(err) {
@@ -682,7 +682,7 @@ func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, tai
// maybeCleanup just deletes the device from the tailnet. All the kubernetes
// resources linked to a ProxyGroup will get cleaned up via owner references
// (which we can use because they are all in the same namespace).
func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, tailscaleClient tsClient, pg *tsapi.ProxyGroup) (bool, error) {
func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup) (bool, error) {
logger := r.logger(pg.Name)
metadata, err := getNodeMetadata(ctx, pg, r.Client, r.tsNamespace)
@@ -691,7 +691,7 @@ func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, tailscaleClient
}
for _, m := range metadata {
if err := r.ensureDeviceDeleted(ctx, tailscaleClient, m.tsID, logger); err != nil {
if err := r.ensureDeviceDeleted(ctx, tsClient, m.tsID, logger); err != nil {
return false, err
}
}
@@ -712,25 +712,23 @@ func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, tailscaleClient
return true, nil
}
func (r *ProxyGroupReconciler) ensureDeviceDeleted(ctx context.Context, tailscaleClient tsClient, id tailcfg.StableNodeID, logger *zap.SugaredLogger) error {
func (r *ProxyGroupReconciler) ensureDeviceDeleted(ctx context.Context, tsClient tsclient.Client, id tailcfg.StableNodeID, logger *zap.SugaredLogger) error {
logger.Debugf("deleting device %s from control", string(id))
if err := tailscaleClient.DeleteDevice(ctx, string(id)); err != nil {
if errResp, ok := errors.AsType[tailscale.ErrResponse](err); ok && errResp.Status == http.StatusNotFound {
logger.Debugf("device %s not found, likely because it has already been deleted from control", string(id))
} else {
return fmt.Errorf("error deleting device: %w", err)
}
} else {
logger.Debugf("device %s deleted from control", string(id))
err := tsClient.Devices().Delete(ctx, string(id))
switch {
case tailscale.IsNotFound(err):
logger.Debugf("device %s not found, likely because it has already been deleted from control", string(id))
case err != nil:
return fmt.Errorf("error deleting device: %w", err)
}
logger.Debugf("device %s deleted from control", string(id))
return nil
}
func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(
ctx context.Context,
tailscaleClient tsClient,
loginUrl string,
tsClient tsclient.Client,
pg *tsapi.ProxyGroup,
proxyClass *tsapi.ProxyClass,
svcToNodePorts map[string]uint16,
@@ -756,7 +754,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(
return nil, err
}
authKey, err := r.getAuthKey(ctx, tailscaleClient, pg, existingCfgSecret, i, logger)
authKey, err := r.getAuthKey(ctx, tsClient, pg, existingCfgSecret, i, logger)
if err != nil {
return nil, err
}
@@ -838,8 +836,8 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(
}
}
if loginUrl != "" {
cfg.ServerURL = new(loginUrl)
if tsClient.LoginURL() != "" {
cfg.ServerURL = new(tsClient.LoginURL())
}
if proxyClass != nil && proxyClass.Spec.TailscaleConfig != nil {
@@ -867,7 +865,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(
return nil, err
}
configs, err := pgTailscaledConfig(pg, loginUrl, proxyClass, i, authKey, endpoints[nodePortSvcName], existingAdvertiseServices)
configs, err := pgTailscaledConfig(pg, tsClient.LoginURL(), proxyClass, i, authKey, endpoints[nodePortSvcName], existingAdvertiseServices)
if err != nil {
return nil, fmt.Errorf("error creating tailscaled config: %w", err)
}
@@ -904,7 +902,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(
// A new key is created if the config Secret doesn't exist yet, or if the
// proxy has requested a reissue via its state Secret. An existing key is
// retained while the device hasn't authed or a reissue is in progress.
func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tailscaleClient tsClient, pg *tsapi.ProxyGroup, existingCfgSecret *corev1.Secret, ordinal int32, logger *zap.SugaredLogger) (*string, error) {
func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup, existingCfgSecret *corev1.Secret, ordinal int32, logger *zap.SugaredLogger) (*string, error) {
// Get state Secret to check if it's already authed or has requested
// a fresh auth key.
stateSecret := &corev1.Secret{
@@ -931,7 +929,7 @@ func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tailscaleClient t
if !createAuthKey {
var err error
createAuthKey, err = r.shouldReissueAuthKey(ctx, tailscaleClient, pg, stateSecret, cfgAuthKey)
createAuthKey, err = r.shouldReissueAuthKey(ctx, tsClient, pg, stateSecret, cfgAuthKey)
if err != nil {
return nil, err
}
@@ -945,7 +943,7 @@ func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tailscaleClient t
if len(tags) == 0 {
tags = r.defaultTags
}
key, err := newAuthKey(ctx, tailscaleClient, tags)
key, err := newAuthKey(ctx, tsClient, tags)
if err != nil {
return nil, err
}
@@ -965,7 +963,7 @@ func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tailscaleClient t
// shouldReissueAuthKey returns true if the proxy needs a new auth key. It
// tracks in-flight reissues via authKeyReissuing to avoid duplicate API calls
// across reconciles.
func (r *ProxyGroupReconciler) shouldReissueAuthKey(ctx context.Context, tailscaleClient tsClient, pg *tsapi.ProxyGroup, stateSecret *corev1.Secret, cfgAuthKey *string) (shouldReissue bool, err error) {
func (r *ProxyGroupReconciler) shouldReissueAuthKey(ctx context.Context, tsClient tsclient.Client, pg *tsapi.ProxyGroup, stateSecret *corev1.Secret, cfgAuthKey *string) (shouldReissue bool, err error) {
r.mu.Lock()
reissuing := r.authKeyReissuing[stateSecret.Name]
r.mu.Unlock()
@@ -1017,7 +1015,7 @@ func (r *ProxyGroupReconciler) shouldReissueAuthKey(ctx context.Context, tailsca
r.log.Infof("Proxy failing to auth; attempting cleanup and new key")
if tsID := stateSecret.Data[kubetypes.KeyDeviceID]; len(tsID) > 0 {
id := tailcfg.StableNodeID(tsID)
if err := r.ensureDeviceDeleted(ctx, tailscaleClient, id, r.log); err != nil {
if err = r.ensureDeviceDeleted(ctx, tsClient, id, r.log); err != nil {
return false, err
}
}
@@ -1305,29 +1303,6 @@ func (r *ProxyGroupReconciler) getRunningProxies(ctx context.Context, pg *tsapi.
return devices, nil
}
// getClientAndLoginURL returns the appropriate Tailscale client and resolved login URL
// for the given tailnet name. If no tailnet is specified, returns the default client
// and login server. Applies fallback to the operator's login server if the tailnet
// doesn't specify a custom login URL.
func (r *ProxyGroupReconciler) getClientAndLoginURL(ctx context.Context, tailnetName string) (tsClient,
string, error) {
if tailnetName == "" {
return r.tsClient, r.loginServer, nil
}
tc, loginUrl, err := clientForTailnet(ctx, r.Client, r.tsNamespace, tailnetName)
if err != nil {
return nil, "", err
}
// Apply fallback if tailnet doesn't specify custom login URL
if loginUrl == "" {
loginUrl = r.loginServer
}
return tc, loginUrl, nil
}
type nodeMetadata struct {
ordinal int32
stateSecret *corev1.Secret