wgengine/magicsock,control/controlclient: do not overwrite discokey with old key (#18606)

When a client starts up without being able to connect to control, it
sends its discoKey to other nodes it wants to communicate with over
TSMP. This disco key will be a newer key than the one control knows
about.

If the client that can connect to control gets a full netmap, ensure
that the disco key for the node not connected to control is not
overwritten with the stale key control knows about.

This is implemented by keeping track of the mapSession and using it for
disco key injection when it is available. This ensures that we are not
constantly resetting the WireGuard connection when getting the wrong
keys from control.

This is implemented as:
 - If the key is received via TSMP:
   - Set lastSeen for the peer to now()
   - Set online for the peer to false
 - When processing new keys, only accept keys where either:
   - Peer is online
   - lastSeen is newer than existing last seen

If mapSession is not available, i.e. we are not yet connected to
control, punt the disco key injection down to magicsock.

Ideally, we will want to have mapSession be long lived at some point in
the near future so we only need to inject keys in one location and then
also use that for testing and loading the cache, but that is a yak for
another PR.

Updates #12639

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
This commit is contained in:
Claus Lensbøl
2026-03-20 08:56:27 -04:00
committed by GitHub
parent ca9aa20255
commit 85bb5f84a5
15 changed files with 346 additions and 46 deletions
+127 -2
View File
@@ -9,6 +9,7 @@ import (
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"io"
"maps"
"net"
@@ -96,6 +97,10 @@ type mapSession struct {
lastPopBrowserURL string
lastTKAInfo *tailcfg.TKAInfo
lastNetmapSummary string // from NetworkMap.VeryConcise
cqmu sync.Mutex
changeQueue chan (*tailcfg.MapResponse)
changeQueueClosed bool
processQueue sync.WaitGroup
}
// newMapSession returns a mostly unconfigured new mapSession.
@@ -118,11 +123,48 @@ func newMapSession(privateNodeKey key.NodePrivate, nu NetmapUpdater, controlKnob
cancel: func() {},
onDebug: func(context.Context, *tailcfg.Debug) error { return nil },
onSelfNodeChanged: func(*netmap.NetworkMap) {},
changeQueue: make(chan *tailcfg.MapResponse),
changeQueueClosed: false,
}
ms.sessionAliveCtx, ms.sessionAliveCtxClose = context.WithCancel(context.Background())
ms.processQueue.Add(1)
go ms.run()
return ms
}
// run processes queued tailcfg.MapResponse values one at a time until the
// session context is cancelled (see Close).
//
// On cancellation the queue is marked closed under cqmu so that no new sender
// can enqueue, every send that was already in flight is received and
// processed, and the channel is then closed.
//
// Senders hold cqmu for the whole duration of their send on the unbuffered
// changeQueue. run must therefore never block on cqmu itself while a sender
// may be waiting for run to receive: run would wait for the lock, the sender
// would wait for a receiver, and Close would hang forever. The lock is
// instead acquired from a helper goroutine while run keeps receiving.
func (ms *mapSession) run() {
	defer ms.processQueue.Done()

	for {
		select {
		case change := <-ms.changeQueue:
			ms.handleNonKeepAliveMapResponse(ms.sessionAliveCtx, change)
		case <-ms.sessionAliveCtx.Done():
			// Mark the queue closed without blocking this goroutine, so
			// that a sender currently parked on the channel send (and
			// holding cqmu) can still be serviced below.
			queueClosed := make(chan struct{})
			go func() {
				ms.cqmu.Lock()
				ms.changeQueueClosed = true
				ms.cqmu.Unlock()
				close(queueClosed)
			}()

			// Drain in-flight sends until the flag has been set. Once the
			// helper has held cqmu, no sender can still be mid-send and
			// every later sender observes changeQueueClosed, so closing
			// the channel afterwards cannot race with a send.
			for {
				select {
				case change := <-ms.changeQueue:
					ms.handleNonKeepAliveMapResponse(ms.sessionAliveCtx, change)
				case <-queueClosed:
					close(ms.changeQueue)
					return
				}
			}
		}
	}
}
// occasionallyPrintSummary logs summary at most once every 5 minutes. The
// summary is the Netmap.VeryConcise result from the last received map response.
func (ms *mapSession) occasionallyPrintSummary(summary string) {
@@ -143,9 +185,48 @@ func (ms *mapSession) clock() tstime.Clock {
// Close cancels the session context via sessionAliveCtxClose and then blocks
// until the queue-processing goroutine tracked by processQueue has exited.
func (ms *mapSession) Close() {
ms.sessionAliveCtxClose()
// Wait for the run goroutine to finish before returning to the caller.
ms.processQueue.Wait()
}
// HandleNonKeepAliveMapResponse handles a non-KeepAlive MapResponse (full or
// ErrChangeQueueClosed is returned by updateDiscoForNode and
// HandleNonKeepAliveMapResponse when the mapSession's change queue has
// already been marked closed, meaning the update was not enqueued.
var ErrChangeQueueClosed = errors.New("change queue closed")
// updateDiscoForNode enqueues a synthetic MapResponse carrying a single
// PeerChange for node id that sets its DiscoKey, LastSeen and Online fields.
//
// The send on changeQueue happens while cqmu is held. If the queue has already
// been marked closed, nothing is enqueued: the call waits for the processing
// goroutine to finish and returns ErrChangeQueueClosed. Otherwise it returns
// nil once the update has been handed to the queue.
func (ms *mapSession) updateDiscoForNode(id tailcfg.NodeID, key key.DiscoPublic, lastSeen time.Time, online bool) error {
	patch := &tailcfg.PeerChange{
		NodeID:   id,
		LastSeen: &lastSeen,
		Online:   &online,
		DiscoKey: &key,
	}
	update := &tailcfg.MapResponse{
		PeersChangedPatch: []*tailcfg.PeerChange{patch},
	}

	ms.cqmu.Lock()
	closed := ms.changeQueueClosed
	if !closed {
		ms.changeQueue <- update
	}
	ms.cqmu.Unlock()

	if closed {
		// The lock is released before waiting, matching the shutdown order.
		ms.processQueue.Wait()
		return ErrChangeQueueClosed
	}
	return nil
}
// HandleNonKeepAliveMapResponse enqueues resp on the session's change queue,
// where the run goroutine passes it to handleNonKeepAliveMapResponse.
//
// The returned error only reflects whether resp was enqueued: it is nil once
// the response has been handed to the queue, and ErrChangeQueueClosed (after
// waiting for the processing goroutine to finish) if the queue has already
// been marked closed. The ctx argument is not used here; the queued response
// is processed with the session's own context.
func (ms *mapSession) HandleNonKeepAliveMapResponse(ctx context.Context, resp *tailcfg.MapResponse) error {
	ms.cqmu.Lock()
	closed := ms.changeQueueClosed
	if !closed {
		ms.changeQueue <- resp
	}
	ms.cqmu.Unlock()

	if !closed {
		return nil
	}
	ms.processQueue.Wait()
	return ErrChangeQueueClosed
}
// handleNonKeepAliveMapResponse handles a non-KeepAlive MapResponse (full or
// incremental).
//
// All fields that are valid on a KeepAlive MapResponse have already been
@@ -153,7 +234,7 @@ func (ms *mapSession) Close() {
//
// TODO(bradfitz): make this handle all fields later. For now (2023-08-20) this
// is [re]factoring progress enough.
func (ms *mapSession) HandleNonKeepAliveMapResponse(ctx context.Context, resp *tailcfg.MapResponse) error {
func (ms *mapSession) handleNonKeepAliveMapResponse(ctx context.Context, resp *tailcfg.MapResponse) error {
if debug := resp.Debug; debug != nil {
if err := ms.onDebug(ctx, debug); err != nil {
return err
@@ -199,6 +280,8 @@ func (ms *mapSession) HandleNonKeepAliveMapResponse(ctx context.Context, resp *t
ms.patchifyPeersChanged(resp)
ms.removeUnwantedDiscoUpdates(resp)
ms.updateStateFromResponse(resp)
if ms.tryHandleIncrementally(resp) {
@@ -281,6 +364,48 @@ type updateStats struct {
changed int
}
// removeUnwantedDiscoUpdates filters resp.PeersChangedPatch in place, dropping
// disco key changes for peers that are reported offline unless their LastSeen
// is newer than the LastSeen recorded for that peer in the current netmap.
//
// A patch is kept when any of the following holds:
//   - it carries no DiscoKey, Online or LastSeen value (nothing to judge by);
//   - it reports the peer as online;
//   - the peer is not present in the current netmap;
//   - the current netmap has a LastSeen for the peer and the patch's LastSeen
//     is strictly after it.
//
// Every other patch is removed from the response as a whole.
func (ms *mapSession) removeUnwantedDiscoUpdates(resp *tailcfg.MapResponse) {
	current := ms.netmap()

	// wanted reports whether a single patch passes the rules above.
	wanted := func(pc *tailcfg.PeerChange) bool {
		switch {
		case pc.DiscoKey == nil, pc.Online == nil, pc.LastSeen == nil:
			return true
		case *pc.Online:
			return true
		}
		idx := current.PeerIndexByNodeID(pc.NodeID)
		if idx < 0 {
			return true
		}
		prev, ok := current.Peers[idx].LastSeen().GetOk()
		return ok && pc.LastSeen.After(prev)
	}

	// Reuse the backing array of the incoming slice for the filtered result.
	kept := resp.PeersChangedPatch[:0]
	for _, pc := range resp.PeersChangedPatch {
		if wanted(pc) {
			kept = append(kept, pc)
		}
	}
	resp.PeersChangedPatch = kept
}
// updateStateFromResponse updates ms from res. It takes ownership of res.
func (ms *mapSession) updateStateFromResponse(resp *tailcfg.MapResponse) {
ms.updatePeersStateFromResponse(resp)