wgengine/magicsock,ipn/ipnlocal: store and load homeDERP from cache (#19491)

With netmap caching, the home DERP of the self node was neither saved to
the cache nor loaded from it, so nodes did not stick to a DERP when
starting without a connection to control.

Instead, when a cache is available, make sure it is loaded before
looking for DERP servers. This is implemented by allowing the ReSTUN to
be skipped when setting the DERP map (we must have a DERP map before
setting the home DERP), so the DERP from the cache is set and stays
sticky until a connection to control is established.

Making DERP only change when connected to control is handled by existing
code from f072d017bd.

Updates #19490

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
This commit is contained in:
Claus Lensbøl
2026-04-29 10:24:09 -04:00
committed by GitHub
parent 1841a93ab2
commit 78627c132f
9 changed files with 493 additions and 20 deletions
+13 -1
View File
@@ -35,7 +35,19 @@ func (b *LocalBackend) writeNetmapToDiskLocked(nm *netmap.NetworkMap) error {
b.diskCache.cache = netmapcache.NewCache(netmapcache.FileStore(dir))
b.diskCache.dir = dir
}
return b.diskCache.cache.Store(b.currentNode().Context(), nm)
// Set the homeDERP on the self node before saving. The self node homeDERP is
// generally not used since the homeDERP for self is stored in magicsock, but
// to be able to load it during loading the cache, we use the existing field
// to save it.
// Make a shallow copy and mutate a copy of the selfNode.
nmCopy := *nm
selfNode := nm.SelfNode.AsStruct()
selfNode.HomeDERP = int(b.currentNode().homeDERP.Load())
nmCopy.SelfNode = selfNode.View()
return b.diskCache.cache.Store(b.currentNode().Context(), &nmCopy)
}
func (b *LocalBackend) loadDiskCacheLocked() (om *netmap.NetworkMap, ok bool) {
+229
View File
@@ -0,0 +1,229 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package ipnlocal
import (
"net/netip"
"testing"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/types/netmap"
"tailscale.com/util/eventbus"
"tailscale.com/wgengine/magicsock"
)
// newCacheTestNetmap builds a small netmap for the disk cache tests: a self
// node with a single address, one user profile, and a DERP map holding empty
// regions with IDs 1 through 11.
func newCacheTestNetmap() *netmap.NetworkMap {
	const numRegions = 11
	regions := make(map[int]*tailcfg.DERPRegion, numRegions)
	for id := 1; id <= numRegions; id++ {
		regions[id] = &tailcfg.DERPRegion{}
	}

	self := &tailcfg.Node{
		Name:      "test-node.ts.net",
		User:      tailcfg.UserID(1),
		Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")},
	}
	profile := &tailcfg.UserProfile{
		LoginName:   "user@example.com",
		DisplayName: "Test User",
	}

	return &netmap.NetworkMap{
		SelfNode: self.View(),
		UserProfiles: map[tailcfg.UserID]tailcfg.UserProfileView{
			tailcfg.UserID(1): profile.View(),
		},
		DERPMap: &tailcfg.DERPMap{Regions: regions},
	}
}
// TestWriteAndLoadHomeDERP checks that the homeDERP held by the current node
// is written to the disk cache and shows up on the self node of the netmap
// loaded back from that cache.
func TestWriteAndLoadHomeDERP(t *testing.T) {
	const wantDERP = 7

	b := newTestBackend(t)
	nm := newCacheTestNetmap()
	b.currentNode().SetNetMap(nm)
	b.currentNode().homeDERP.Store(wantDERP)

	b.mu.Lock()
	defer b.mu.Unlock()

	err := b.writeNetmapToDiskLocked(nm)
	if err != nil {
		t.Fatalf("writeNetmapToDiskLocked: %v", err)
	}

	loaded, ok := b.loadDiskCacheLocked()
	switch {
	case !ok:
		t.Fatal("loadDiskCacheLocked returned ok=false")
	case !loaded.SelfNode.Valid():
		t.Fatal("loaded netmap SelfNode is invalid")
	}

	got := loaded.SelfNode.HomeDERP()
	if got != wantDERP {
		t.Errorf("loaded SelfNode.HomeDERP() = %d, want %d", got, wantDERP)
	}
}
func TestOnHomeDERPUpdate(t *testing.T) {
t.Run("normal_derp_change", func(t *testing.T) {
b := newTestBackend(t)
done := make(chan struct{})
tstest.Replace(t, &testOnlyHomeDERPUpdate, func() { close(done) })
nm := newCacheTestNetmap()
b.currentNode().SetNetMap(nm)
// Publish a HomeDERPChanged event via the backend's event bus.
bus := b.Sys().Bus.Get()
ec := bus.Client("test.TestOnHomeDERPUpdate")
pub := eventbus.Publish[magicsock.HomeDERPChanged](ec)
const wantDERP = 11
pub.Publish(magicsock.HomeDERPChanged{Old: 0, New: wantDERP})
<-done
if got := b.currentNode().homeDERP.Load(); got != wantDERP {
t.Errorf("b.homeDERP = %d, want %d", got, wantDERP)
}
// Verify the value was persisted to the disk cache.
b.mu.Lock()
defer b.mu.Unlock()
loaded, ok := b.loadDiskCacheLocked()
if !ok {
t.Fatal("loadDiskCacheLocked returned ok=false after homeDERP update")
}
if got := loaded.SelfNode.HomeDERP(); got != wantDERP {
t.Errorf("cached SelfNode.HomeDERP() = %d, want %d", got, wantDERP)
}
})
t.Run("old_does_not_match", func(t *testing.T) {
b := newTestBackend(t)
done := make(chan struct{})
tstest.Replace(t, &testOnlyHomeDERPUpdate, func() { close(done) })
const setDERP = 11
const wantDERP = 4
nm := newCacheTestNetmap()
selfNode := nm.SelfNode.AsStruct()
selfNode.HomeDERP = wantDERP
nm.SelfNode = selfNode.View()
b.currentNode().SetNetMap(nm)
b.currentNode().homeDERP.Store(wantDERP)
// Write an initial cache entry so we can verify it is not overwritten.
b.mu.Lock()
if err := b.writeNetmapToDiskLocked(nm); err != nil {
b.mu.Unlock()
t.Fatalf("setup writeNetmapToDiskLocked: %v", err)
}
b.mu.Unlock()
// Publish a HomeDERPChanged event via the backend's event bus.
bus := b.Sys().Bus.Get()
ec := bus.Client("test.TestOnHomeDERPUpdate")
pub := eventbus.Publish[magicsock.HomeDERPChanged](ec)
pub.Publish(magicsock.HomeDERPChanged{Old: wantDERP + 1, New: setDERP})
<-done
if got := b.currentNode().homeDERP.Load(); got != wantDERP {
t.Errorf("b.homeDERP = %d, wanted no change %d", got, wantDERP)
}
// Verify the cache still exists and still holds the original value.
b.mu.Lock()
defer b.mu.Unlock()
loaded, ok := b.loadDiskCacheLocked()
if !ok {
t.Fatal("loadDiskCacheLocked returned ok=false; expected cache to still exist")
}
if got := loaded.SelfNode.HomeDERP(); got != wantDERP {
t.Errorf("cached SelfNode.HomeDERP() = %d after rejected event, want original %d", got, wantDERP)
}
})
t.Run("new_does_not_exist_in_map", func(t *testing.T) {
b := newTestBackend(t)
done := make(chan struct{})
tstest.Replace(t, &testOnlyHomeDERPUpdate, func() { close(done) })
const setDERP = 111
const wantDERP = 4
nm := newCacheTestNetmap()
selfNode := nm.SelfNode.AsStruct()
selfNode.HomeDERP = wantDERP
nm.SelfNode = selfNode.View()
b.currentNode().SetNetMap(nm)
b.currentNode().homeDERP.Store(wantDERP)
// Write an initial cache entry so we can verify it is not overwritten.
b.mu.Lock()
if err := b.writeNetmapToDiskLocked(nm); err != nil {
b.mu.Unlock()
t.Fatalf("setup writeNetmapToDiskLocked: %v", err)
}
b.mu.Unlock()
// Publish a HomeDERPChanged event via the backend's event bus.
// Old matches the stored homeDERP so only the "new region not in map"
// guard is exercised.
bus := b.Sys().Bus.Get()
ec := bus.Client("test.TestOnHomeDERPUpdate")
pub := eventbus.Publish[magicsock.HomeDERPChanged](ec)
pub.Publish(magicsock.HomeDERPChanged{Old: wantDERP, New: setDERP})
<-done
if got := b.currentNode().homeDERP.Load(); got != wantDERP {
t.Errorf("b.homeDERP = %d, wanted no change %d", got, wantDERP)
}
// Verify the cache still exists and still holds the original value.
b.mu.Lock()
defer b.mu.Unlock()
loaded, ok := b.loadDiskCacheLocked()
if !ok {
t.Fatal("loadDiskCacheLocked returned ok=false; expected cache to still exist")
}
if got := loaded.SelfNode.HomeDERP(); got != wantDERP {
t.Errorf("cached SelfNode.HomeDERP() = %d after rejected event, want original %d", got, wantDERP)
}
})
}
// TestWriteNetmapDoesNotMutateOriginal checks that writing a netmap to the
// disk cache leaves the HomeDERP of the caller's netmap untouched, even when
// the current node holds a different homeDERP.
func TestWriteNetmapDoesNotMutateOriginal(t *testing.T) {
	const storeDERP = 5

	b := newTestBackend(t)
	nm := newCacheTestNetmap()
	b.currentNode().SetNetMap(nm)

	// Remember the value before the write; expected to be 0 initially.
	before := nm.SelfNode.HomeDERP()
	b.currentNode().homeDERP.Store(storeDERP)

	b.mu.Lock()
	defer b.mu.Unlock()

	err := b.writeNetmapToDiskLocked(nm)
	if err != nil {
		t.Fatalf("writeNetmapToDiskLocked: %v", err)
	}

	// The original netmap must not have been mutated.
	after := nm.SelfNode.HomeDERP()
	if after != before {
		t.Errorf("original nm.SelfNode.HomeDERP() = %d after write, want %d (original was mutated)", after, before)
	}
}
+60 -4
View File
@@ -627,6 +627,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
}
eventbus.SubscribeFunc(ec, b.onAppConnectorRouteUpdate)
eventbus.SubscribeFunc(ec, b.onAppConnectorStoreRoutes)
eventbus.SubscribeFunc(ec, b.onHomeDERPUpdate)
mConn.SetNetInfoCallback(b.setNetInfo) // TODO(tailscale/tailscale#17887): move to eventbus
return b, nil
@@ -658,6 +659,51 @@ func (b *LocalBackend) onAppConnectorStoreRoutes(ri appctype.RouteInfo) {
}
}
// testOnlyHomeDERPUpdate, if non-nil, is called by onHomeDERPUpdate after it
// has processed a HomeDERPChanged event, while b.mu is still held. It is
// called whether or not the event resulted in a homeDERP change.
var testOnlyHomeDERPUpdate func()
// onHomeDERPUpdate handles a magicsock.HomeDERPChanged event. It takes b.mu,
// applies the change via onHomeDERPUpdateLocked and then, still holding b.mu,
// invokes the testOnlyHomeDERPUpdate hook if one is set.
func (b *LocalBackend) onHomeDERPUpdate(du magicsock.HomeDERPChanged) {
	b.mu.Lock()
	defer b.mu.Unlock()

	b.onHomeDERPUpdateLocked(du)

	if hook := testOnlyHomeDERPUpdate; hook != nil {
		hook()
	}
}
// onHomeDERPUpdateLocked conditionally updates the homeDERP for use in the
// netmap cache.
//
// The update is applied only if the current node has a DERP map containing
// region du.New and the currently stored homeDERP equals du.Old. When the
// update is applied, the netmap is rewritten to the disk cache; a failure to
// write is logged and otherwise ignored.
//
// If we switched our currentNode by switching profiles, we might be trying
// to update the homeDERP from another profile. If the old homeDERP does not
// match what we expect, don't swap the homeDERP.
// In practice, it is possible that one profile with a homeDERP of 0 (no-derp)
// got switched before setting any home DERP or that DERP IDs match across
// DERP maps. Since the risk of this happening is small and the consequence
// is just a possibly less optimal DERP until the next reSTUN, accept this
// possibility.
//
// b.mu must be held.
func (b *LocalBackend) onHomeDERPUpdateLocked(du magicsock.HomeDERPChanged) {
	cn := b.currentNode()
	if cn == nil {
		return
	}

	// Fetch the DERP map once so that the nil check and the region lookup
	// below are guaranteed to look at the same map.
	dm := cn.DERPMap()
	if dm == nil {
		return
	}

	// Indexing a nil Regions map is safe and reports ok == false, so no
	// separate nil check is needed.
	if _, ok := dm.Regions[du.New]; !ok {
		return
	}

	// Only move from du.Old to du.New; anything else means the event does
	// not describe the homeDERP we currently hold.
	if !cn.homeDERP.CompareAndSwap(int64(du.Old), int64(du.New)) {
		return
	}

	if err := b.writeNetmapToDiskLocked(b.NetMap()); err != nil {
		b.logf("write netmap to cache: %v", err)
	}
}
// Clock returns the backend's clock (b.clock).
func (b *LocalBackend) Clock() tstime.Clock {
	return b.clock
}
// Sys returns the backend's tsd.System (b.sys).
func (b *LocalBackend) Sys() *tsd.System {
	return b.sys
}
@@ -1821,7 +1867,18 @@ func (b *LocalBackend) setControlClientStatusLocked(c controlclient.Client, st c
}
b.e.SetNetworkMap(st.NetMap)
b.MagicConn().SetDERPMap(st.NetMap.DERPMap)
b.MagicConn().SetDERPMap(st.NetMap.DERPMap, false)
if c == nil && st.NetMap.Cached && st.NetMap.SelfNode.Valid() {
// Loading from a cached netmap (c == nil means no live control
// client). Pre-seed the home DERP from the cached self node so
// that the guard in maybeSetNearestDERP prevents changing the
// DERP home before we reconnect to the control plane. If the cache has
// nothing in it, skip this, and let the node pick a DERP itself.
if cachedHome := st.NetMap.SelfNode.HomeDERP(); cachedHome != 0 {
b.health.SetOutOfPollNetMap()
b.MagicConn().ForceSetNearestDERP(cachedHome)
}
}
b.MagicConn().SetOnlyTCP443(st.NetMap.HasCap(tailcfg.NodeAttrOnlyTCP443))
// Update our cached DERP map
@@ -3388,7 +3445,7 @@ func (b *LocalBackend) DebugForceNetmapUpdate() {
nm := b.currentNode().NetMap()
b.e.SetNetworkMap(nm)
if nm != nil {
b.MagicConn().SetDERPMap(nm.DERPMap)
b.MagicConn().SetDERPMap(nm.DERPMap, true)
}
b.setNetMapLocked(nm)
}
@@ -4846,7 +4903,7 @@ func (b *LocalBackend) setPrefsLocked(newp *ipn.Prefs) ipn.PrefsView {
}
if netMap != nil {
b.MagicConn().SetDERPMap(netMap.DERPMap)
b.MagicConn().SetDERPMap(netMap.DERPMap, true)
}
if !oldp.WantRunning() && newp.WantRunning && cc != nil {
@@ -5208,7 +5265,6 @@ func (b *LocalBackend) authReconfig() {
//
// b.mu must be held.
func (b *LocalBackend) authReconfigLocked() {
if b.shutdownCalled {
b.logf("[v1] authReconfig: skipping because in shutdown")
return
+7
View File
@@ -80,6 +80,13 @@ type nodeBackend struct {
eventClient *eventbus.Client
derpMapViewPub *eventbus.Publisher[tailcfg.DERPMapView]
// homeDERP lives here temporarily. As long as mapSession is short-lived,
// nothing that delivers netmaps to LocalBackend knows our homeDERP, so it
// is cached here for now.
// TODO(cmol): move this field into a refactored mapSession that is not
// short-lived.
homeDERP atomic.Int64
// TODO(nickkhyl): maybe use sync.RWMutex?
mu syncs.Mutex // protects the following fields