control/controlclient,ipn/ipnlocal,wgengine: avoid restarting wireguard when key is learned via tsmp (#19142)
When disco keys are learned on a node that is connected to control and has a mapSession, wgengine will see the key as having changed, and assume that any existing connections will need to be reset. For keys learned via TSMP, the connection should not be reset as that key is learned via an active wireguard connection. If wgengine resets that connetion, a 15s timeout will occur. This change adds a map to track new keys coming in via TSMP, and removes them from the list of keys that needs to trigger wireguard resets. This is done with an interface chain from controlclient down via localBackend to userspaceEngine via the watchdog. Once a key has been actively used for preventing a wireguard reset, the key is removed from the map. If mapSession becomes a long lived process instead of being dependent on having a connection to control. This interface chain can be removed, and the event sequence from wrap->controlClient->userspaceEngine, can be changed to wrap->userspaceEngine->controlClient as we know the map will not be gunked up with stale TSMP entries. Updates #12639 Signed-off-by: Claus Lensbøl <claus@tailscale.com>
This commit is contained in:
@@ -478,6 +478,27 @@ func (mrs mapRoutineState) UpdateNetmapDelta(muts []netmap.NodeMutation) bool {
|
||||
return err == nil && ok
|
||||
}
|
||||
|
||||
var _ patchDiscoKeyer = mapRoutineState{}
|
||||
|
||||
func (mrs mapRoutineState) PatchDiscoKey(pub key.NodePublic, disco key.DiscoPublic) {
|
||||
c := mrs.c
|
||||
c.mu.Lock()
|
||||
goodState := c.loggedIn && c.inMapPoll
|
||||
dun, ok := c.observer.(patchDiscoKeyer)
|
||||
c.mu.Unlock()
|
||||
|
||||
if !goodState || !ok {
|
||||
return
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(c.mapCtx, 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
c.observerQueue.RunSync(ctx, func() {
|
||||
dun.PatchDiscoKey(pub, disco)
|
||||
})
|
||||
}
|
||||
|
||||
// mapRoutine is responsible for keeping a read-only streaming connection to the
|
||||
// control server, and keeping the netmap up to date.
|
||||
func (c *Auto) mapRoutine() {
|
||||
|
||||
@@ -228,6 +228,15 @@ type NetmapDeltaUpdater interface {
|
||||
UpdateNetmapDelta([]netmap.NodeMutation) (ok bool)
|
||||
}
|
||||
|
||||
// patchDiscoKeyer is an optional interface that can be implemented by an [Observer] to be
|
||||
// notified about node disco keys received out-of-band from control, via
|
||||
// existing connection state.
|
||||
type patchDiscoKeyer interface {
|
||||
// PatchDiscoKey reports to the receiver that the specified disco key
|
||||
// for node was obtained out-of-band from control.
|
||||
PatchDiscoKey(key.NodePublic, key.DiscoPublic)
|
||||
}
|
||||
|
||||
var nextControlClientID atomic.Int64
|
||||
|
||||
// NewDirect returns a new Direct client.
|
||||
@@ -367,7 +376,7 @@ func NewDirect(opts Options) (*Direct, error) {
|
||||
// mapSession has gone away, we want to fall back to pushing the key
|
||||
// further down the chain.
|
||||
if err := c.streamingMapSession.updateDiscoForNode(
|
||||
peer.ID(), update.Key, time.Now(), false); err == nil ||
|
||||
peer.ID(), peer.Key(), update.Key, time.Now(), false); err == nil ||
|
||||
!errors.Is(err, ErrChangeQueueClosed) {
|
||||
return
|
||||
}
|
||||
@@ -377,10 +386,7 @@ func NewDirect(opts Options) (*Direct, error) {
|
||||
// not have a mapSession (we are not connected to control) or because the
|
||||
// mapSession queue has closed.
|
||||
c.logf("controlclient direct: updating discoKey for %v via magicsock", update.Src)
|
||||
discoKeyPub.Publish(events.PeerDiscoKeyUpdate{
|
||||
Src: update.Src,
|
||||
Key: update.Key,
|
||||
})
|
||||
discoKeyPub.Publish(events.PeerDiscoKeyUpdate(update))
|
||||
})
|
||||
|
||||
return c, nil
|
||||
@@ -859,8 +865,10 @@ func (c *Direct) PollNetMap(ctx context.Context, nu NetmapUpdater) error {
|
||||
// update it observed. It is used by tests and [NetmapFromMapResponseForDebug].
|
||||
// It will report only the first netmap seen.
|
||||
type rememberLastNetmapUpdater struct {
|
||||
last *netmap.NetworkMap
|
||||
done chan any
|
||||
last *netmap.NetworkMap
|
||||
lastTSMPKey key.NodePublic
|
||||
lastTSMPDisco key.DiscoPublic
|
||||
done chan any
|
||||
}
|
||||
|
||||
func (nu *rememberLastNetmapUpdater) UpdateFullNetmap(nm *netmap.NetworkMap) {
|
||||
@@ -871,6 +879,11 @@ func (nu *rememberLastNetmapUpdater) UpdateFullNetmap(nm *netmap.NetworkMap) {
|
||||
}
|
||||
}
|
||||
|
||||
func (nu *rememberLastNetmapUpdater) PatchDiscoKey(key key.NodePublic, disco key.DiscoPublic) {
|
||||
nu.lastTSMPKey = key
|
||||
nu.lastTSMPDisco = disco
|
||||
}
|
||||
|
||||
// FetchNetMapForTest fetches the netmap once.
|
||||
func (c *Direct) FetchNetMapForTest(ctx context.Context) (*netmap.NetworkMap, error) {
|
||||
var nu rememberLastNetmapUpdater
|
||||
|
||||
@@ -37,6 +37,11 @@ import (
|
||||
"tailscale.com/wgengine/filter"
|
||||
)
|
||||
|
||||
type responseWithSource struct {
|
||||
response *tailcfg.MapResponse
|
||||
viaTSMP bool
|
||||
}
|
||||
|
||||
// mapSession holds the state over a long-polled "map" request to the
|
||||
// control plane.
|
||||
//
|
||||
@@ -98,7 +103,7 @@ type mapSession struct {
|
||||
lastTKAInfo *tailcfg.TKAInfo
|
||||
lastNetmapSummary string // from NetworkMap.VeryConcise
|
||||
cqmu sync.Mutex
|
||||
changeQueue chan (*tailcfg.MapResponse)
|
||||
changeQueue chan responseWithSource
|
||||
changeQueueClosed bool
|
||||
processQueue sync.WaitGroup
|
||||
}
|
||||
@@ -123,7 +128,7 @@ func newMapSession(privateNodeKey key.NodePrivate, nu NetmapUpdater, controlKnob
|
||||
cancel: func() {},
|
||||
onDebug: func(context.Context, *tailcfg.Debug) error { return nil },
|
||||
onSelfNodeChanged: func(*netmap.NetworkMap) {},
|
||||
changeQueue: make(chan *tailcfg.MapResponse),
|
||||
changeQueue: make(chan responseWithSource),
|
||||
changeQueueClosed: false,
|
||||
}
|
||||
ms.sessionAliveCtx, ms.sessionAliveCtxClose = context.WithCancel(context.Background())
|
||||
@@ -142,7 +147,7 @@ func (ms *mapSession) run() {
|
||||
for {
|
||||
select {
|
||||
case change := <-ms.changeQueue:
|
||||
ms.handleNonKeepAliveMapResponse(ms.sessionAliveCtx, change)
|
||||
ms.handleNonKeepAliveMapResponse(ms.sessionAliveCtx, change.response, change.viaTSMP)
|
||||
case <-ms.sessionAliveCtx.Done():
|
||||
// Drain any remaining items in the queue before exiting.
|
||||
// Lock the queue during this time to avoid updates through other channels
|
||||
@@ -154,7 +159,7 @@ func (ms *mapSession) run() {
|
||||
for {
|
||||
select {
|
||||
case change := <-ms.changeQueue:
|
||||
ms.handleNonKeepAliveMapResponse(ms.sessionAliveCtx, change)
|
||||
ms.handleNonKeepAliveMapResponse(ms.sessionAliveCtx, change.response, change.viaTSMP)
|
||||
default:
|
||||
// Queue is empty, close it and exit
|
||||
close(ms.changeQueue)
|
||||
@@ -190,7 +195,7 @@ func (ms *mapSession) Close() {
|
||||
|
||||
var ErrChangeQueueClosed = errors.New("change queue closed")
|
||||
|
||||
func (ms *mapSession) updateDiscoForNode(id tailcfg.NodeID, key key.DiscoPublic, lastSeen time.Time, online bool) error {
|
||||
func (ms *mapSession) updateDiscoForNode(id tailcfg.NodeID, key key.NodePublic, discoKey key.DiscoPublic, lastSeen time.Time, online bool) error {
|
||||
ms.cqmu.Lock()
|
||||
|
||||
if ms.changeQueueClosed {
|
||||
@@ -199,13 +204,17 @@ func (ms *mapSession) updateDiscoForNode(id tailcfg.NodeID, key key.DiscoPublic,
|
||||
return ErrChangeQueueClosed
|
||||
}
|
||||
|
||||
resp := &tailcfg.MapResponse{
|
||||
PeersChangedPatch: []*tailcfg.PeerChange{{
|
||||
NodeID: id,
|
||||
LastSeen: &lastSeen,
|
||||
Online: &online,
|
||||
DiscoKey: &key,
|
||||
}},
|
||||
resp := responseWithSource{
|
||||
response: &tailcfg.MapResponse{
|
||||
PeersChangedPatch: []*tailcfg.PeerChange{{
|
||||
NodeID: id,
|
||||
Key: &key,
|
||||
LastSeen: &lastSeen,
|
||||
Online: &online,
|
||||
DiscoKey: &discoKey,
|
||||
}},
|
||||
},
|
||||
viaTSMP: true,
|
||||
}
|
||||
ms.changeQueue <- resp
|
||||
ms.cqmu.Unlock()
|
||||
@@ -221,7 +230,12 @@ func (ms *mapSession) HandleNonKeepAliveMapResponse(ctx context.Context, resp *t
|
||||
return ErrChangeQueueClosed
|
||||
}
|
||||
|
||||
ms.changeQueue <- resp
|
||||
change := responseWithSource{
|
||||
response: resp,
|
||||
viaTSMP: false,
|
||||
}
|
||||
|
||||
ms.changeQueue <- change
|
||||
ms.cqmu.Unlock()
|
||||
return nil
|
||||
}
|
||||
@@ -234,7 +248,7 @@ func (ms *mapSession) HandleNonKeepAliveMapResponse(ctx context.Context, resp *t
|
||||
//
|
||||
// TODO(bradfitz): make this handle all fields later. For now (2023-08-20) this
|
||||
// is [re]factoring progress enough.
|
||||
func (ms *mapSession) handleNonKeepAliveMapResponse(ctx context.Context, resp *tailcfg.MapResponse) error {
|
||||
func (ms *mapSession) handleNonKeepAliveMapResponse(ctx context.Context, resp *tailcfg.MapResponse, viaTSMP bool) error {
|
||||
if debug := resp.Debug; debug != nil {
|
||||
if err := ms.onDebug(ctx, debug); err != nil {
|
||||
return err
|
||||
@@ -284,6 +298,13 @@ func (ms *mapSession) handleNonKeepAliveMapResponse(ctx context.Context, resp *t
|
||||
|
||||
ms.updateStateFromResponse(resp)
|
||||
|
||||
// If source was learned via TSMP, the updated disco key need to be marked in
|
||||
// userspaceEngine as an update that should not reconfigure the wireguard
|
||||
// connection.
|
||||
if viaTSMP {
|
||||
ms.tryMarkDiscoAsLearnedFromTSMP(resp)
|
||||
}
|
||||
|
||||
if ms.tryHandleIncrementally(resp) {
|
||||
ms.occasionallyPrintSummary(ms.lastNetmapSummary)
|
||||
return nil
|
||||
@@ -312,6 +333,21 @@ func (ms *mapSession) handleNonKeepAliveMapResponse(ctx context.Context, resp *t
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ms *mapSession) tryMarkDiscoAsLearnedFromTSMP(res *tailcfg.MapResponse) {
|
||||
dun, ok := ms.netmapUpdater.(patchDiscoKeyer)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
// In reality we should never really have more than one change here over TSMP.
|
||||
for _, change := range res.PeersChangedPatch {
|
||||
if change == nil || change.DiscoKey == nil || change.Key == nil {
|
||||
continue
|
||||
}
|
||||
dun.PatchDiscoKey(*change.Key, *change.DiscoKey)
|
||||
}
|
||||
}
|
||||
|
||||
// upgradeNode upgrades Node fields from the server into the modern forms
|
||||
// not using deprecated fields.
|
||||
func upgradeNode(n *tailcfg.Node) {
|
||||
|
||||
@@ -33,7 +33,9 @@ import (
|
||||
"tailscale.com/util/eventbus/eventbustest"
|
||||
"tailscale.com/util/mak"
|
||||
"tailscale.com/util/must"
|
||||
"tailscale.com/util/usermetric"
|
||||
"tailscale.com/util/zstdframe"
|
||||
"tailscale.com/wgengine"
|
||||
)
|
||||
|
||||
func eps(s ...string) []netip.AddrPort {
|
||||
@@ -678,6 +680,7 @@ func TestUpdateDiscoForNode(t *testing.T) {
|
||||
// Insert existing node
|
||||
node := tailcfg.Node{
|
||||
ID: 1,
|
||||
Key: key.NewNode().Public(),
|
||||
DiscoKey: oldKey.Public(),
|
||||
Online: &tt.initialOnline,
|
||||
LastSeen: &tt.initialLastSeen,
|
||||
@@ -690,7 +693,7 @@ func TestUpdateDiscoForNode(t *testing.T) {
|
||||
}
|
||||
|
||||
newKey := key.NewDisco()
|
||||
ms.updateDiscoForNode(node.ID, newKey.Public(), tt.updateLastSeen, tt.updateOnline)
|
||||
ms.updateDiscoForNode(node.ID, node.Key, newKey.Public(), tt.updateLastSeen, tt.updateOnline)
|
||||
<-nu.done
|
||||
|
||||
nm := ms.netmap()
|
||||
@@ -707,6 +710,82 @@ func TestUpdateDiscoForNode(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateDiscoForNodeCallback(t *testing.T) {
|
||||
t.Run("key_wired_through_to_updater", func(t *testing.T) {
|
||||
nu := &rememberLastNetmapUpdater{
|
||||
done: make(chan any, 1),
|
||||
}
|
||||
ms := newTestMapSession(t, nu)
|
||||
|
||||
oldKey := key.NewDisco()
|
||||
|
||||
// Insert existing node
|
||||
node := tailcfg.Node{
|
||||
ID: 1,
|
||||
Key: key.NewNode().Public(),
|
||||
DiscoKey: oldKey.Public(),
|
||||
Online: new(false),
|
||||
LastSeen: new(time.Unix(1, 0)),
|
||||
}
|
||||
|
||||
if nm := ms.netmapForResponse(&tailcfg.MapResponse{
|
||||
Peers: []*tailcfg.Node{&node},
|
||||
}); len(nm.Peers) != 1 {
|
||||
t.Fatalf("node not inserted")
|
||||
}
|
||||
|
||||
newKey := key.NewDisco()
|
||||
ms.updateDiscoForNode(node.ID, node.Key, newKey.Public(), time.Now(), false)
|
||||
<-nu.done
|
||||
|
||||
if nu.lastTSMPKey != node.Key || nu.lastTSMPDisco != newKey.Public() {
|
||||
t.Fatalf("expected [%s]=%s, got [%s]=%s", node.Key, newKey.Public(),
|
||||
nu.lastTSMPKey, nu.lastTSMPDisco)
|
||||
}
|
||||
})
|
||||
t.Run("key_not_wired_through_to_updater", func(t *testing.T) {
|
||||
nu := &rememberLastNetmapUpdater{
|
||||
done: make(chan any, 1),
|
||||
}
|
||||
ms := newTestMapSession(t, nu)
|
||||
|
||||
oldKey := key.NewDisco()
|
||||
|
||||
// Insert existing node
|
||||
node := tailcfg.Node{
|
||||
ID: 1,
|
||||
Key: key.NewNode().Public(),
|
||||
DiscoKey: oldKey.Public(),
|
||||
Online: new(false),
|
||||
LastSeen: new(time.Unix(1, 0)),
|
||||
}
|
||||
|
||||
if nm := ms.netmapForResponse(&tailcfg.MapResponse{
|
||||
Peers: []*tailcfg.Node{&node},
|
||||
}); len(nm.Peers) != 1 {
|
||||
t.Fatalf("node not inserted")
|
||||
}
|
||||
|
||||
newKey := key.NewDisco().Public()
|
||||
resp := &tailcfg.MapResponse{
|
||||
PeersChangedPatch: []*tailcfg.PeerChange{{
|
||||
NodeID: node.ID,
|
||||
Key: &node.Key,
|
||||
LastSeen: new(time.Now()),
|
||||
Online: new(true),
|
||||
DiscoKey: &newKey,
|
||||
}},
|
||||
}
|
||||
ms.HandleNonKeepAliveMapResponse(t.Context(), resp)
|
||||
<-nu.done
|
||||
|
||||
if !nu.lastTSMPKey.IsZero() || !nu.lastTSMPDisco.IsZero() {
|
||||
t.Fatalf("expected zero keys, got [%s]=%s",
|
||||
nu.lastTSMPKey, nu.lastTSMPDisco)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func first[T any](s []T) T {
|
||||
if len(s) == 0 {
|
||||
var zero T
|
||||
@@ -1568,3 +1647,22 @@ func TestLearnZstdOfKeepAlive(t *testing.T) {
|
||||
t.Fatalf("got %d zstd decodes; want %d", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPathDiscokeyerImplementations(t *testing.T) {
|
||||
bus := eventbustest.NewBus(t)
|
||||
ht := health.NewTracker(bus)
|
||||
reg := new(usermetric.Registry)
|
||||
e, err := wgengine.NewFakeUserspaceEngine(t.Logf, 0, ht, reg, bus)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(e.Close)
|
||||
if _, ok := e.(patchDiscoKeyer); !ok {
|
||||
t.Error("wgengine.userspaceEngine must implement patchDiscoKeyer")
|
||||
}
|
||||
|
||||
wd := wgengine.NewWatchdog(e)
|
||||
if _, ok := wd.(patchDiscoKeyer); !ok {
|
||||
t.Error("wgengine.watchdogEngine must implement patchDiscoKeyer")
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user