wgengine/magicsock,control/controlclient: do not overwrite discokey with old key (#18606)

When a client starts up without being able to connect to control, it sends its discoKey to other nodes it wants to communicate with over TSMP. This disco key will be a newer key than the one control knows about. If the client that can connect to control gets a full netmap, ensure that the disco key for the node not connected to control is not overwritten with the stale key control knows about.

This is implemented by keeping track of the mapSession and using it for the disco key injection if it is available. This ensures that we are not constantly resetting the WireGuard connection when getting the wrong keys from control.

This is implemented as:
- If the key is received via TSMP:
  - Set lastSeen for the peer to now()
  - Set online for the peer to false
- When processing new keys, only accept keys where either:
  - Peer is online
  - lastSeen is newer than the existing lastSeen

If mapSession is not available, as in we are not yet connected to control, punt the disco key injection down to magicsock.

Ideally, we will want mapSession to be long-lived at some point in the near future so we only need to inject keys in one location and can then also use that for testing and loading the cache, but that is a yak for another PR.

Updates #12639

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
2026-03-20 08:56:27 -04:00
parent ca9aa20255
commit 85bb5f84a5
15 changed files with 346 additions and 46 deletions
@@ -9,6 +9,7 @@ import (
 	"crypto/sha256"
 	"encoding/hex"
 	"encoding/json"
+	"errors"
 	"io"
 	"maps"
 	"net"
@@ -96,6 +97,10 @@ type mapSession struct {
 	lastPopBrowserURL      string
 	lastTKAInfo            *tailcfg.TKAInfo
 	lastNetmapSummary      string // from NetworkMap.VeryConcise
+	cqmu                   sync.Mutex
+	changeQueue            chan (*tailcfg.MapResponse)
+	changeQueueClosed      bool
+	processQueue           sync.WaitGroup
 }

 // newMapSession returns a mostly unconfigured new mapSession.
@@ -118,11 +123,48 @@ func newMapSession(privateNodeKey key.NodePrivate, nu NetmapUpdater, controlKnob
 		cancel:            func() {},
 		onDebug:           func(context.Context, *tailcfg.Debug) error { return nil },
 		onSelfNodeChanged: func(*netmap.NetworkMap) {},
+		changeQueue:       make(chan *tailcfg.MapResponse),
+		changeQueueClosed: false,
 	}
 	ms.sessionAliveCtx, ms.sessionAliveCtxClose = context.WithCancel(context.Background())
+	ms.processQueue.Add(1)
+	go ms.run()
 	return ms
 }

+// run is the queue-processing goroutine started by newMapSession. It receives
+// tailcfg.MapResponse values from ms.changeQueue and applies them one at a
+// time via handleNonKeepAliveMapResponse until ms.sessionAliveCtx is done
+// (Close cancels it).
+//
+// On shutdown it marks the queue closed under cqmu so that
+// HandleNonKeepAliveMapResponse and updateDiscoForNode stop enqueueing,
+// processes whatever can still be received without blocking, closes the
+// channel and returns. processQueue is marked done on return, which unblocks
+// Close and any caller waiting on the closed queue.
+//
+// NOTE(review): the error returned by handleNonKeepAliveMapResponse is
+// discarded here, so it never reaches the caller that enqueued the response
+// — confirm that is intended.
+func (ms *mapSession) run() {
+	defer ms.processQueue.Done()
+
+	for {
+		select {
+		case change := <-ms.changeQueue:
+			ms.handleNonKeepAliveMapResponse(ms.sessionAliveCtx, change)
+		case <-ms.sessionAliveCtx.Done():
+			// Drain any remaining items in the queue before exiting.
+			// Lock the queue during this time to avoid updates through other channels
+			// to be overwritten. This is especially relevant for calls to
+			// updateDiscoForNode.
+			//
+			// NOTE(review): this Lock waits for any producer currently holding
+			// cqmu. changeQueue is unbuffered and nothing receives from it while
+			// we wait here, so producers must not block indefinitely on a send
+			// while holding the lock.
+			ms.cqmu.Lock()
+			ms.changeQueueClosed = true
+			ms.cqmu.Unlock()
+			for {
+				select {
+				case change := <-ms.changeQueue:
+					// sessionAliveCtx is already done at this point.
+					ms.handleNonKeepAliveMapResponse(ms.sessionAliveCtx, change)
+				default:
+					// Queue is empty, close it and exit
+					close(ms.changeQueue)
+					return
+				}
+			}
+		}
+	}
+}
+
 // occasionallyPrintSummary logs summary at most once every 5 minutes. The
 // summary is the Netmap.VeryConcise result from the last received map response.
 func (ms *mapSession) occasionallyPrintSummary(summary string) {
@@ -143,9 +185,48 @@ func (ms *mapSession) clock() tstime.Clock {

+// Close cancels the session-alive context and then blocks until the
+// queue-processing goroutine (run) has finished.
 func (ms *mapSession) Close() {
 	ms.sessionAliveCtxClose()
+	// run signals processQueue when it returns after shutting the queue down.
+	ms.processQueue.Wait()
 }

-// HandleNonKeepAliveMapResponse handles a non-KeepAlive MapResponse (full or
+// ErrChangeQueueClosed is returned by HandleNonKeepAliveMapResponse and
+// updateDiscoForNode when the mapSession's change queue has already been
+// closed and the response was therefore not enqueued.
+var ErrChangeQueueClosed = errors.New("change queue closed")
+
+// updateDiscoForNode enqueues a synthetic MapResponse that patches the peer
+// identified by id with the given disco key, lastSeen time and online state.
+// The patch travels through the same queue as responses passed to
+// HandleNonKeepAliveMapResponse and is applied asynchronously by run.
+//
+// It returns nil once the patch has been handed to the processing goroutine.
+// It returns ErrChangeQueueClosed if the session is already closed, or is
+// closed before the patch could be handed over; in that case it first waits
+// for the processing goroutine to exit.
+func (ms *mapSession) updateDiscoForNode(id tailcfg.NodeID, key key.DiscoPublic, lastSeen time.Time, online bool) error {
+	// Build the response before taking the lock to keep the critical
+	// section short.
+	resp := &tailcfg.MapResponse{
+		PeersChangedPatch: []*tailcfg.PeerChange{{
+			NodeID:   id,
+			LastSeen: &lastSeen,
+			Online:   &online,
+			DiscoKey: &key,
+		}},
+	}
+
+	ms.cqmu.Lock()
+	if ms.changeQueueClosed {
+		ms.cqmu.Unlock()
+		ms.processQueue.Wait()
+		return ErrChangeQueueClosed
+	}
+
+	// changeQueue is unbuffered and run needs cqmu in order to shut down. A
+	// bare send here could therefore block forever while holding the very
+	// lock run is waiting for. Give up as soon as the session is cancelled.
+	select {
+	case ms.changeQueue <- resp:
+		ms.cqmu.Unlock()
+		return nil
+	case <-ms.sessionAliveCtx.Done():
+		ms.cqmu.Unlock()
+		ms.processQueue.Wait()
+		return ErrChangeQueueClosed
+	}
+}
+
+// HandleNonKeepAliveMapResponse enqueues a non-KeepAlive MapResponse (full or
+// incremental) for processing by the mapSession's queue goroutine. Responses
+// are applied one at a time, in the order they were enqueued.
+//
+// It returns nil once resp has been handed to the processing goroutine; the
+// outcome of the processing itself is not reported back to the caller. It
+// returns ctx.Err() if ctx is done before resp could be handed over, and
+// ErrChangeQueueClosed if the session is already closed or is closed while
+// waiting. In the latter case it first waits for the processing goroutine to
+// exit.
+func (ms *mapSession) HandleNonKeepAliveMapResponse(ctx context.Context, resp *tailcfg.MapResponse) error {
+	ms.cqmu.Lock()
+	if ms.changeQueueClosed {
+		ms.cqmu.Unlock()
+		ms.processQueue.Wait()
+		return ErrChangeQueueClosed
+	}
+
+	// changeQueue is unbuffered and run needs cqmu in order to shut down. A
+	// bare send here could therefore block forever while holding the very
+	// lock run is waiting for. Stop waiting when either the session or the
+	// caller's context is cancelled.
+	select {
+	case ms.changeQueue <- resp:
+		ms.cqmu.Unlock()
+		return nil
+	case <-ms.sessionAliveCtx.Done():
+		ms.cqmu.Unlock()
+		ms.processQueue.Wait()
+		return ErrChangeQueueClosed
+	case <-ctx.Done():
+		ms.cqmu.Unlock()
+		return ctx.Err()
+	}
+}
+
+// handleNonKeepAliveMapResponse handles a non-KeepAlive MapResponse (full or
 // incremental).
 //
 // All fields that are valid on a KeepAlive MapResponse have already been
@@ -153,7 +234,7 @@ func (ms *mapSession) Close() {
 //
 // TODO(bradfitz): make this handle all fields later. For now (2023-08-20) this
 // is [re]factoring progress enough.
-func (ms *mapSession) HandleNonKeepAliveMapResponse(ctx context.Context, resp *tailcfg.MapResponse) error {
+func (ms *mapSession) handleNonKeepAliveMapResponse(ctx context.Context, resp *tailcfg.MapResponse) error {
 	if debug := resp.Debug; debug != nil {
 		if err := ms.onDebug(ctx, debug); err != nil {
 			return err
@@ -199,6 +280,8 @@ func (ms *mapSession) HandleNonKeepAliveMapResponse(ctx context.Context, resp *t

 	ms.patchifyPeersChanged(resp)

+	ms.removeUnwantedDiscoUpdates(resp)
+
 	ms.updateStateFromResponse(resp)

 	if ms.tryHandleIncrementally(resp) {
@@ -281,6 +364,48 @@ type updateStats struct {
 	changed int
 }

+// removeUnwantedDiscoUpdates filters resp.PeersChangedPatch in place, dropping
+// changes that would replace a peer's disco key based on information older
+// than what is already recorded for that peer.
+//
+// A change is dropped only when all of the following hold:
+//   - it sets DiscoKey, Online and LastSeen,
+//   - it reports the peer as offline,
+//   - the peer exists in the current netmap with a recorded LastSeen, and
+//   - the change's LastSeen is not after that recorded LastSeen.
+//
+// Every other change is kept, including changes for peers without a recorded
+// LastSeen, since there is nothing to compare against. Note that a dropped
+// change is removed as a whole, not just its DiscoKey field.
+func (ms *mapSession) removeUnwantedDiscoUpdates(resp *tailcfg.MapResponse) {
+	if len(resp.PeersChangedPatch) == 0 {
+		return
+	}
+
+	// Build the current netmap at most once, and only if some change
+	// actually has to be compared against the existing peer.
+	existingMap := sync.OnceValue(ms.netmap)
+
+	// stale reports whether change must be rejected.
+	stale := func(change *tailcfg.PeerChange) bool {
+		// Accept if:
+		// - DiscoKey is nil and did not change.
+		// - Fields we rely on for rejection are missing.
+		if change.DiscoKey == nil || change.Online == nil || change.LastSeen == nil {
+			return false
+		}
+
+		// Accept if:
+		// - Node is online.
+		if *change.Online {
+			return false
+		}
+
+		// Accept if:
+		// - Cannot find the peer, don't have enough data.
+		nm := existingMap()
+		peerIdx := nm.PeerIndexByNodeID(change.NodeID)
+		if peerIdx < 0 {
+			return false
+		}
+
+		// Accept if:
+		// - The peer has no recorded lastSeen, so nothing proves the
+		//   change is older than what we already have.
+		existingLastSeen, ok := nm.Peers[peerIdx].LastSeen().GetOk()
+		if !ok {
+			return false
+		}
+
+		// Accept only if lastSeen moved forward in time.
+		return !change.LastSeen.After(existingLastSeen)
+	}
+
+	// Filter in place, reusing the backing array of the original slice.
+	accepted := resp.PeersChangedPatch[:0]
+	for _, change := range resp.PeersChangedPatch {
+		if !stale(change) {
+			accepted = append(accepted, change)
+		}
+	}
+	resp.PeersChangedPatch = accepted
+}
+
 // updateStateFromResponse updates ms from res. It takes ownership of res.
 func (ms *mapSession) updateStateFromResponse(resp *tailcfg.MapResponse) {
 	ms.updatePeersStateFromResponse(resp)