control/controlclient,ipn/ipnlocal,wgengine: avoid restarting wireguard when key is learned via tsmp (#19142)

When disco keys are learned on a node that is connected to control and
has a mapSession, wgengine will see the key as having changed, and
assume that any existing connections will need to be reset.

For keys learned via TSMP, the connection should not be reset as that
key is learned via an active wireguard connection. If wgengine resets
that connection, a 15s timeout will occur.

This change adds a map to track new keys coming in via TSMP, and removes
them from the list of keys that need to trigger wireguard resets. This
is done with an interface chain from controlclient down via localBackend
to userspaceEngine via the watchdog.

Once a key has been actively used for preventing a wireguard reset, the
key is removed from the map.

If mapSession becomes a long-lived process instead of being dependent on
having a connection to control, this interface chain can be removed, and
the event sequence wrap->controlClient->userspaceEngine can be
changed to wrap->userspaceEngine->controlClient, as we know the map will
not be gunked up with stale TSMP entries.

Updates #12639

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
This commit is contained in:
Claus Lensbøl
2026-03-30 14:26:08 -04:00
committed by GitHub
parent 99f8039101
commit bf467727fc
8 changed files with 323 additions and 23 deletions
+36 -1
View File
@@ -121,7 +121,8 @@ type userspaceEngine struct {
birdClient BIRDClient // or nil
controlKnobs *controlknobs.Knobs // or nil
testMaybeReconfigHook func() // for tests; if non-nil, fires if maybeReconfigWireguardLocked called
testMaybeReconfigHook func() // for tests; if non-nil, fires if maybeReconfigWireguardLocked called
testDiscoChangedHook func(map[key.NodePublic]bool) // for tests; if non-nil, fires after assembling discoChanged map
// isLocalAddr reports whether an IP is assigned to the local
// tunnel interface. It's used to reflect local packets
@@ -167,6 +168,10 @@ type userspaceEngine struct {
// networkLogger logs statistics about network connections.
networkLogger netlog.Logger
// tsmpLearnedDisco tracks per node key if a peer disco key was learned via TSMP.
// wgLock must be held when using this map.
tsmpLearnedDisco map[key.NodePublic]key.DiscoPublic
// Lock ordering: magicsock.Conn.mu, wgLock, then mu.
}
@@ -1028,6 +1033,12 @@ func (e *userspaceEngine) ResetAndStop() (*Status, error) {
}
}
// PatchDiscoKey records disco as the disco key learned via TSMP for the peer
// identified by pub. Reconfig consults this record: when a peer's disco key
// changes to the recorded value, the peer is not marked for a WireGuard
// reset and the record is deleted.
//
// It acquires wgLock for the duration of the update.
func (e *userspaceEngine) PatchDiscoKey(pub key.NodePublic, disco key.DiscoPublic) {
	e.wgLock.Lock()
	defer e.wgLock.Unlock()

	// The map is created lazily on first use; assigning into a nil map
	// would panic.
	if e.tsmpLearnedDisco == nil {
		e.tsmpLearnedDisco = make(map[key.NodePublic]key.DiscoPublic)
	}
	e.tsmpLearnedDisco[pub] = disco
}
func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config, dnsCfg *dns.Config) error {
if routerCfg == nil {
panic("routerCfg must not be nil")
@@ -1119,14 +1130,31 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config,
if p.DiscoKey.IsZero() {
continue
}
// If the key changed, mark the connection for reconfiguration.
pub := p.PublicKey
if old, ok := prevEP[pub]; ok && old != p.DiscoKey {
// If the disco key was learned via TSMP, we do not need to reset the
// wireguard config as the new key was received over an existing wireguard
// connection.
if discoTSMP, okTSMP := e.tsmpLearnedDisco[p.PublicKey]; okTSMP &&
discoTSMP == p.DiscoKey {
delete(e.tsmpLearnedDisco, p.PublicKey)
e.logf("wgengine: Skipping reconfig (TSMP key): %s changed from %q to %q", pub.ShortString(), old, p.DiscoKey)
continue
}
discoChanged[pub] = true
e.logf("wgengine: Reconfig: %s changed from %q to %q", pub.ShortString(), old, p.DiscoKey)
}
}
}
// For tests: report which disco connections need to be changed.
if e.testDiscoChangedHook != nil {
e.testDiscoChangedHook(discoChanged)
}
e.lastCfgFull = *cfg.Clone()
// Tell magicsock about the new (or initial) private key
@@ -1144,6 +1172,13 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config,
return err
}
// Clean up the map of TSMP marks for peers that no longer exist in the config.
for nodeKey := range e.tsmpLearnedDisco {
if !peerSet.Contains(nodeKey) {
delete(e.tsmpLearnedDisco, nodeKey)
}
}
// Shutdown the network logger because the IDs changed.
// Let it be started back up by subsequent logic.
if buildfeatures.HasNetLog && netLogIDsChanged && e.networkLogger.Running() {