From d3ba1480f5c23def55578f9cc086a0d2c6b9a1a9 Mon Sep 17 00:00:00 2001 From: Alex Valiushko Date: Wed, 15 Apr 2026 19:22:07 -0700 Subject: [PATCH] magicsock: invalidate endpoint on trust timeout (#19415) Endpoint's best address was cleared on trustBestAddrUntil expiry only if it was a udprelay connection. This generalizes invalidation to also cover direct UDP. Trust deadline is checked in two cases: On disco ping timeout from the endpoint's best address. Traffic goes DERP-only, heartbeats to the old address stop. The discovery pings are still in flight, handled by the following. On disco ping success from an alternative. BestAddr switches to the working path, trust refreshed, eager discovery stops. The still in flight pongs are handled by betterAddr(). Updates #19407 Change-Id: Ic41ed18edb4a6e4350a2d49271ba01566a6a6964 Signed-off-by: Alex Valiushko --- wgengine/magicsock/endpoint.go | 11 +- wgengine/magicsock/endpoint_test.go | 234 ++++++++++++++++++++++++++++ 2 files changed, 237 insertions(+), 8 deletions(-) diff --git a/wgengine/magicsock/endpoint.go b/wgengine/magicsock/endpoint.go index b8d3b96be..31dcab67b 100644 --- a/wgengine/magicsock/endpoint.go +++ b/wgengine/magicsock/endpoint.go @@ -1183,8 +1183,7 @@ func (de *endpoint) discoPingTimeout(txid stun.TxID) { return } bestUntrusted := mono.Now().After(de.trustBestAddrUntil) - if sp.to == de.bestAddr.epAddr && sp.to.vni.IsSet() && bestUntrusted { - // TODO(jwhited): consider applying this to direct UDP paths as well + if sp.to == de.bestAddr.epAddr && bestUntrusted { de.clearBestAddrLocked() } if debugDisco() || !de.bestAddr.ap.IsValid() || bestUntrusted { @@ -1778,12 +1777,8 @@ func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src epAdd latency: latency, wireMTU: pingSizeToPktLen(sp.size, sp.to), } - // TODO(jwhited): consider checking de.trustBestAddrUntil as well. If - // de.bestAddr is untrusted we may want to clear it, otherwise we could - // get stuck with a forever untrusted bestAddr that blackholes, since - // we don't clear direct UDP paths on disco ping timeout (see - // discoPingTimeout). - if betterAddr(thisPong, de.bestAddr) { + bestUntrusted := now.After(de.trustBestAddrUntil) + if betterAddr(thisPong, de.bestAddr) || bestUntrusted { de.c.logf("magicsock: disco: node %v %v now using %v mtu=%v tx=%x", de.publicKey.ShortString(), de.discoShort(), sp.to, thisPong.wireMTU, m.TxID[:6]) de.debugUpdates.Add(EndpointChange{ When: time.Now(), diff --git a/wgengine/magicsock/endpoint_test.go b/wgengine/magicsock/endpoint_test.go index eba742b00..593cf1455 100644 --- a/wgengine/magicsock/endpoint_test.go +++ b/wgengine/magicsock/endpoint_test.go @@ -6,12 +6,16 @@ package magicsock import ( "net/netip" "testing" + "testing/synctest" "time" + "tailscale.com/disco" "tailscale.com/net/packet" + "tailscale.com/net/stun" "tailscale.com/tailcfg" "tailscale.com/tstime/mono" "tailscale.com/types/key" + "tailscale.com/util/ringlog" ) func TestProbeUDPLifetimeConfig_Equals(t *testing.T) { @@ -453,3 +457,233 @@ func Test_endpoint_udpRelayEndpointReady(t *testing.T) { }) } } + +func Test_endpoint_discoPingTimeout(t *testing.T) { + expired := -1 * time.Hour + valid := 1 * time.Hour + directAddrA := epAddr{ap: netip.MustParseAddrPort("192.0.2.1:7")} + relayAddrA := epAddr{ap: netip.MustParseAddrPort("192.0.2.2:77")} + relayAddrA.vni.Set(1) + directAddrB := epAddr{ap: netip.MustParseAddrPort("192.0.2.3:7")} + relayAddrB := epAddr{ap: netip.MustParseAddrPort("192.0.2.4:77")} + relayAddrB.vni.Set(1) + + for _, tc := range []struct { + name string + bestAddr addrQuality + trustBestAddrUntil time.Duration + pingTo epAddr + wantBestAddrCleared bool + }{ + { + name: "relay-path-trust-expired", + bestAddr: addrQuality{epAddr: relayAddrA}, + trustBestAddrUntil: expired, + pingTo: relayAddrA, + wantBestAddrCleared: true, + }, + { + name: "direct-udp-path-trust-expired", + bestAddr: addrQuality{epAddr: directAddrA}, + trustBestAddrUntil: expired, + pingTo: directAddrA, + wantBestAddrCleared: true, + }, + { + name: "direct-udp-path-trust-valid", + bestAddr: addrQuality{epAddr: directAddrA}, + trustBestAddrUntil: valid, + pingTo: directAddrA, + wantBestAddrCleared: false, + }, + { + name: "relay-path-trust-valid", + bestAddr: addrQuality{epAddr: relayAddrA}, + trustBestAddrUntil: valid, + pingTo: relayAddrA, + wantBestAddrCleared: false, + }, + { + name: "ping-to-different-direct-addr-trust-expired", + bestAddr: addrQuality{epAddr: directAddrA}, + trustBestAddrUntil: expired, + pingTo: directAddrB, + wantBestAddrCleared: false, + }, + { + name: "ping-to-different-relay-addr-trust-expired", + bestAddr: addrQuality{epAddr: relayAddrA}, + trustBestAddrUntil: expired, + pingTo: relayAddrB, + wantBestAddrCleared: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + now := mono.Now() // synctest to match this to the internal 'now' + c := &Conn{ + logf: func(msg string, args ...any) {}, + } + c.discoAtomic.Set(key.NewDisco()) + de := &endpoint{ + c: c, + bestAddr: tc.bestAddr, + trustBestAddrUntil: now.Add(tc.trustBestAddrUntil), + sentPing: make(map[stun.TxID]sentPing), + } + txid := stun.NewTxID() + timer := time.NewTimer(time.Hour) + timer.Stop() + de.sentPing[txid] = sentPing{ + to: tc.pingTo, + at: now.Add(-100 * time.Millisecond), + timer: timer, + purpose: pingDiscovery, + } + + de.discoPingTimeout(txid) + if tc.wantBestAddrCleared { + if de.bestAddr.ap.IsValid() { + t.Errorf("expected bestAddr to be cleared, but bestAddr.ap is valid: %v", de.bestAddr.ap) + } + if de.trustBestAddrUntil != 0 { + t.Errorf("expected trustBestAddrUntil to be cleared, but got: %v", de.trustBestAddrUntil) + } + } else { + if de.bestAddr != tc.bestAddr { + t.Errorf("expected bestAddr to be unchanged, got: %v, want: %v", de.bestAddr, tc.bestAddr) + } + } + if _, ok := de.sentPing[txid]; ok { + t.Errorf("expected sentPing[txid] to be removed, but it still exists") + } + }) + }) + } +} + +func Test_endpoint_handlePongConnLocked(t *testing.T) { + goodLatency := 50 * time.Millisecond + badLatency := 100 * time.Millisecond + expired := -1 * time.Hour + valid := 1 * time.Hour + directAddrA := epAddr{ap: netip.MustParseAddrPort("192.0.2.1:7")} + directAddrB := epAddr{ap: netip.MustParseAddrPort("192.0.2.2:8")} + derpAddr := epAddr{ap: netip.AddrPortFrom(tailcfg.DerpMagicIPAddr, 0)} + + for _, tc := range []struct { + name string + bestAddr addrQuality + trustBestAddrUntil time.Duration + pongFrom epAddr + pongLatency time.Duration + wantBestAddr epAddr + }{ + { + name: "better-latency-trust-valid", + bestAddr: addrQuality{epAddr: directAddrA, latency: badLatency}, + trustBestAddrUntil: valid, + pongFrom: directAddrB, + pongLatency: goodLatency, + wantBestAddr: directAddrB, + }, + { + name: "worse-latency-trust-valid", + bestAddr: addrQuality{epAddr: directAddrA, latency: goodLatency}, + trustBestAddrUntil: valid, + pongFrom: directAddrB, + pongLatency: badLatency, + wantBestAddr: directAddrA, + }, + { + name: "worse-latency-trust-expired", + bestAddr: addrQuality{epAddr: directAddrA, latency: goodLatency}, + trustBestAddrUntil: expired, + pongFrom: directAddrB, + pongLatency: badLatency, + wantBestAddr: directAddrB, + }, + { + name: "same-path-trust-expired", + bestAddr: addrQuality{epAddr: directAddrA, latency: badLatency}, + trustBestAddrUntil: expired, + pongFrom: directAddrA, + pongLatency: goodLatency, // updated latency + wantBestAddr: directAddrA, + }, + { + name: "derp-pong-trust-expired", + bestAddr: addrQuality{epAddr: directAddrA, latency: badLatency}, + trustBestAddrUntil: expired, + pongFrom: derpAddr, + pongLatency: goodLatency, + wantBestAddr: directAddrA, + }, + { + name: "better-latency-trust-expired", + bestAddr: addrQuality{epAddr: directAddrA, latency: badLatency}, + trustBestAddrUntil: expired, + pongFrom: directAddrB, + pongLatency: goodLatency, + wantBestAddr: directAddrB, + }, + } { + t.Run(tc.name, func(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + now := mono.Now() // synctest to match this to the internal 'now' + pm := newPeerMap() + c := &Conn{ + logf: func(msg string, args ...any) {}, + peerMap: pm, + } + c.discoAtomic.Set(key.NewDisco()) + de := &endpoint{ + c: c, + bestAddr: tc.bestAddr, + bestAddrAt: now.Add(-5 * time.Minute), + trustBestAddrUntil: now.Add(tc.trustBestAddrUntil), + sentPing: make(map[stun.TxID]sentPing), + endpointState: make(map[netip.AddrPort]*endpointState), + debugUpdates: ringlog.New[EndpointChange](10), + } + txid := stun.NewTxID() + pong := &disco.Pong{ + TxID: txid, + Src: tc.pongFrom.ap, + } + timer := time.NewTimer(time.Hour) + timer.Stop() + de.sentPing[txid] = sentPing{ + to: tc.pongFrom, + at: now.Add(-tc.pongLatency), + timer: timer, + purpose: pingDiscovery, + } + if tc.pongFrom.ap.Addr() != tailcfg.DerpMagicIPAddr && !tc.pongFrom.vni.IsSet() { + de.endpointState[tc.pongFrom.ap] = &endpointState{} + } + di := &discoInfo{ + discoKey: key.NewDisco().Public(), + discoShort: "test", + } + + knownTxID := de.handlePongConnLocked(pong, di, tc.pongFrom) + if !knownTxID { + t.Errorf("expected knownTxID to be true, got false") + } + if de.bestAddr.epAddr != tc.wantBestAddr { + t.Errorf("expected bestAddr.epAddr to be %v, got: %v", tc.wantBestAddr, de.bestAddr.epAddr) + } + if tc.pongFrom == tc.bestAddr.epAddr && de.bestAddr.latency-tc.pongLatency > 0 { + t.Errorf("expected latency to be %v, got: %v", tc.pongLatency, de.bestAddr.latency) + } + if tc.pongFrom != derpAddr && de.trustBestAddrUntil.Before(now) { + t.Errorf("expected trustBestAddrUntil to be refreshed, but it's in the past: %v", de.trustBestAddrUntil) + } + if _, ok := de.sentPing[txid]; ok { + t.Errorf("expected sentPing[txid] to be removed, but it still exists") + } + }) + }) + } +}