wgengine/userspace: do not fall back to old key on tsmpLearned mismatch (#19575)

The mismatch behaviour of falling back to a previous key could end up
breaking connections when the netmap update took longer than the 2
seconds allowed in controlClient.auto for netmap updates, or if the
controlClient context was canceled. This could end up breaking
legitimate updates to the netmap for disco keys coming from control.

Instead, log the event, and let the connection be reset to that of the
key as that is safer.

Issue found by @bradfitz.

Updates #19574

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
This commit is contained in:
Claus Lensbøl
2026-04-29 13:23:04 -04:00
committed by GitHub
parent fd6ae2fad4
commit be7cce74ba
2 changed files with 18 additions and 22 deletions
+13 -18
View File
@@ -1176,31 +1176,26 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config,
// wireguard config as the new key was received over an existing wireguard // wireguard config as the new key was received over an existing wireguard
// connection. // connection.
if discoTSMP, okTSMP := e.tsmpLearnedDisco[p.PublicKey]; okTSMP { if discoTSMP, okTSMP := e.tsmpLearnedDisco[p.PublicKey]; okTSMP {
if discoTSMP == p.DiscoKey {
// Key matches, remove entry from map. // Key matches, remove entry from map.
delete(e.tsmpLearnedDisco, p.PublicKey)
if discoTSMP == p.DiscoKey {
e.logf("wgengine: Skipping reconfig (TSMP key): %s changed from %q to %q", e.logf("wgengine: Skipping reconfig (TSMP key): %s changed from %q to %q",
pub.ShortString(), old, p.DiscoKey) pub.ShortString(), old, p.DiscoKey)
delete(e.tsmpLearnedDisco, p.PublicKey) // Skip session clear.
} else { continue
}
// The new disco key does not match what we received via // The new disco key does not match what we received via
// TSMP for this peer. This is unexpected, so log it. // TSMP for this peer. This is unexpected, though possible
// If it does happen, overwrite the previously-saved // if processing a change in a large netmap ends up taking
// disco key with the new one for now: We expect another // longer than the 2 second timeout in
// update must be pending in that case, so keep the map // [controlClient.mapRoutineState.UpdateNetmapDelta], or if
// entry. // the context is cancelled mid update. Log the event, and reset
// The reason why this should never happen is that only a single // the connection as it is possibly a stale entry in the map
// request is coming through the netmap pipeline at a time, and there // instead of a TSMP disco key update that led us here.
// should realistically ever only be a single entry in the map. This
// is really a belt and suspenders solution to find usage that is
// inconsistent with our expectations.
e.logf("wgengine: [unexpected] Reconfig: using TSMP key for %s (control stale): tsmp=%q control=%q old=%q", e.logf("wgengine: [unexpected] Reconfig: using TSMP key for %s (control stale): tsmp=%q control=%q old=%q",
pub.ShortString(), discoTSMP, p.DiscoKey, old) pub.ShortString(), discoTSMP, p.DiscoKey, old)
metricTSMPLearnedKeyMismatch.Add(1) metricTSMPLearnedKeyMismatch.Add(1)
p.DiscoKey = discoTSMP
}
// Skip session clear no matter what.
continue
} }
discoChanged[pub] = true discoChanged[pub] = true
+3 -2
View File
@@ -264,8 +264,9 @@ func TestUserspaceEngineTSMPLearnedMismatch(t *testing.T) {
wrongKey bool wrongKey bool
}{ }{
{tsmp: false, inMap: false, wrongKey: false}, {tsmp: false, inMap: false, wrongKey: false},
{tsmp: true, inMap: false, wrongKey: true}, {tsmp: true, inMap: false, wrongKey: false},
{tsmp: false, inMap: false, wrongKey: false}, {tsmp: true, inMap: true, wrongKey: true},
{tsmp: false, inMap: true, wrongKey: false},
} }
nkHex := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" nkHex := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"