ipn/ipnlocal,wgengine/magicsock: add basic counters for cached peer connectivity (#19699)
Add new clientmetric counters for establishing contact with peers while using cached network map data. To do this, instrument the magicsock.Conn with a bit to indicate whether its peer data came from a cached netmap. If so, there are two conditions we will count as establishing connectivity to a peer: - Receipt of a CallMeMaybe from a peer via disco. - Establishing a valid endpoint address for a peer. In vmtest, add Env.ClientMetrics to scrape metrics from the specified node. Use this to check that counters were updated in caching tests. Updates https://github.com/tailscale/projects/issues/13 Updates #12639 Change-Id: Ie8cf3244ac8af4f5bcfe4d0d944078da2ba08990 Signed-off-by: M. J. Fromberger <fromberger@tailscale.com>
This commit is contained in:
@@ -6587,7 +6587,11 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
|
|||||||
b.currentNode().SetNetMap(nm)
|
b.currentNode().SetNetMap(nm)
|
||||||
if ms, ok := b.sys.MagicSock.GetOK(); ok {
|
if ms, ok := b.sys.MagicSock.GetOK(); ok {
|
||||||
if nm != nil {
|
if nm != nil {
|
||||||
ms.SetNetworkMap(nm.SelfNode, nm.Peers)
|
if nm.Cached {
|
||||||
|
ms.SetNetworkMapCached(nm.SelfNode, nm.Peers)
|
||||||
|
} else {
|
||||||
|
ms.SetNetworkMap(nm.SelfNode, nm.Peers)
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
ms.SetNetworkMap(tailcfg.NodeView{}, nil)
|
ms.SetNetworkMap(tailcfg.NodeView{}, nil)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -38,6 +38,8 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/google/gopacket/layers"
|
"github.com/google/gopacket/layers"
|
||||||
|
dto "github.com/prometheus/client_model/go"
|
||||||
|
"github.com/prometheus/common/expfmt"
|
||||||
"go4.org/mem"
|
"go4.org/mem"
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
"tailscale.com/client/local"
|
"tailscale.com/client/local"
|
||||||
@@ -847,6 +849,68 @@ func (e *Env) Status(n *Node) *ipnstate.Status {
|
|||||||
return st
|
return st
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ClientMetrics returns the client metrics exported by the given node.
|
||||||
|
func (e *Env) ClientMetrics(n *Node) ClientMetrics {
|
||||||
|
e.t.Helper()
|
||||||
|
raw, err := n.Agent().DaemonMetrics(e.t.Context())
|
||||||
|
if err != nil {
|
||||||
|
e.t.Fatalf("Node %q DaemonMetrics: %v", n.Name(), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Metrics are reported in Prometheus exposition format.
|
||||||
|
var parser expfmt.TextParser
|
||||||
|
mfs, err := parser.TextToMetricFamilies(bytes.NewReader(raw))
|
||||||
|
if err != nil {
|
||||||
|
e.t.Fatalf("Node %q parse client metrics: %v", n.Name(), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tailscale client metrics are all unlabelled integer-valued counters and
|
||||||
|
// gauges, so we don't need to handle the full generality of the Prometheus
|
||||||
|
// representation. If we see anything else, we'll log and skip it.
|
||||||
|
out := make(ClientMetrics)
|
||||||
|
for _, mf := range mfs {
|
||||||
|
name := mf.GetName()
|
||||||
|
if _, ok := out[name]; ok {
|
||||||
|
e.t.Logf("Node %q: duplicate client metric %q (ignored)", n.Name(), name)
|
||||||
|
continue
|
||||||
|
} else if len(mf.Metric) != 1 {
|
||||||
|
e.t.Logf("Node %q: got %d values for client metric %q, want 1 (ignored)", n.Name(), len(mf.Metric), name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
var mtype string
|
||||||
|
var value int64
|
||||||
|
switch mf.GetType() {
|
||||||
|
case dto.MetricType_COUNTER:
|
||||||
|
mtype = "counter"
|
||||||
|
value = int64(mf.Metric[0].GetCounter().GetValue())
|
||||||
|
case dto.MetricType_GAUGE:
|
||||||
|
mtype = "gauge"
|
||||||
|
value = int64(mf.Metric[0].GetGauge().GetValue())
|
||||||
|
default:
|
||||||
|
e.t.Logf("Node %q unexpected client metric %q type %q (ignored)", n.Name(), name, mf.GetType().String())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out[name] = ClientMetric{
|
||||||
|
Name: name,
|
||||||
|
Type: mtype,
|
||||||
|
Value: value,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// ClientMetrics is a view of the client metrics exported by a node.
|
||||||
|
// The keys of the map are the metric names.
|
||||||
|
type ClientMetrics map[string]ClientMetric
|
||||||
|
|
||||||
|
// ClientMetric is a view of a node client metric.
|
||||||
|
type ClientMetric struct {
|
||||||
|
Name string // as published to the clientmetrics package
|
||||||
|
Type string // either "gauge" or "counter"
|
||||||
|
Value int64 // the gauge or counter value
|
||||||
|
}
|
||||||
|
|
||||||
// SetAcceptRoutes toggles the node's RouteAll preference (the
|
// SetAcceptRoutes toggles the node's RouteAll preference (the
|
||||||
// --accept-routes flag), controlling whether it installs subnet routes
|
// --accept-routes flag), controlling whether it installs subnet routes
|
||||||
// advertised by peers.
|
// advertised by peers.
|
||||||
|
|||||||
@@ -923,6 +923,20 @@ func TestMullvadExitNode(t *testing.T) {
|
|||||||
check(checkOff2Step, "exit-off (again)", clientWAN)
|
check(checkOff2Step, "exit-off (again)", clientWAN)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// checkClientMetrics verifies that each entry in want exists and has the given
|
||||||
|
// value in metrics.
|
||||||
|
func checkClientMetrics(t *testing.T, label string, metrics vmtest.ClientMetrics, want map[string]int64) {
|
||||||
|
t.Helper()
|
||||||
|
for name, wantValue := range want {
|
||||||
|
got, ok := metrics[name]
|
||||||
|
if !ok {
|
||||||
|
t.Errorf("%s: required metric %q not found", label, name)
|
||||||
|
} else if got.Value != wantValue {
|
||||||
|
t.Errorf("%s: metric %q: got %v, want %v", label, name, got.Value, wantValue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TestCachedNetmapAfterRestart verifies that two nodes with netmap
|
// TestCachedNetmapAfterRestart verifies that two nodes with netmap
|
||||||
// caching enabled (NodeAttrCacheNetworkMaps) can re-establish a direct
|
// caching enabled (NodeAttrCacheNetworkMaps) can re-establish a direct
|
||||||
// WireGuard tunnel after both are restarted while the control server is
|
// WireGuard tunnel after both are restarted while the control server is
|
||||||
@@ -1020,13 +1034,23 @@ func TestDirectConnectionWithCachedNetmapOnOneNode(t *testing.T) {
|
|||||||
vmtest.OS(vmtest.Gokrazy),
|
vmtest.OS(vmtest.Gokrazy),
|
||||||
tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil})
|
tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil})
|
||||||
|
|
||||||
|
checkInitialMetrics := env.AddStep("Check initial client metrics")
|
||||||
cutControlStep := env.AddStep("Cut control server access")
|
cutControlStep := env.AddStep("Cut control server access")
|
||||||
restartStep := env.AddStep("Restart tailscaled on a")
|
restartStep := env.AddStep("Restart tailscaled on a")
|
||||||
tsmpPingStep := env.AddStep("Ping a → b TSMP (cached netmap, no control)")
|
tsmpPingStep := env.AddStep("Ping a → b TSMP (cached netmap, no control)")
|
||||||
DiscoPingStep := env.AddStep("Ping a → b Disco (want Direct)")
|
discoPingStep := env.AddStep("Ping a → b Disco (want Direct)")
|
||||||
|
checkFinalMetrics := env.AddStep("Check final client metrics")
|
||||||
|
|
||||||
env.Start()
|
env.Start()
|
||||||
|
|
||||||
|
// Before: Verify that we have not recorded any cached contacts.
|
||||||
|
checkInitialMetrics.Begin()
|
||||||
|
checkClientMetrics(t, "Node A", env.ClientMetrics(a), map[string]int64{
|
||||||
|
"magicsock_cached_peer_contact_derp": 0,
|
||||||
|
"magicsock_cached_peer_contact_direct": 0,
|
||||||
|
})
|
||||||
|
checkInitialMetrics.End(nil)
|
||||||
|
|
||||||
cutControlStep.Begin()
|
cutControlStep.Begin()
|
||||||
a.DropControlTraffic()
|
a.DropControlTraffic()
|
||||||
env.ControlServer().SetOnMapRequest(func(nk key.NodePublic) {
|
env.ControlServer().SetOnMapRequest(func(nk key.NodePublic) {
|
||||||
@@ -1047,10 +1071,17 @@ func TestDirectConnectionWithCachedNetmapOnOneNode(t *testing.T) {
|
|||||||
}
|
}
|
||||||
tsmpPingStep.End(nil)
|
tsmpPingStep.End(nil)
|
||||||
|
|
||||||
DiscoPingStep.Begin()
|
discoPingStep.Begin()
|
||||||
if err := env.PingExpect(a, b, vmtest.PingRouteDirect, 30*time.Second); err != nil {
|
if err := env.PingExpect(a, b, vmtest.PingRouteDirect, 30*time.Second); err != nil {
|
||||||
DiscoPingStep.End(err)
|
discoPingStep.End(err)
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
DiscoPingStep.End(nil)
|
discoPingStep.End(nil)
|
||||||
|
|
||||||
|
// After: Verify that we recorded a direct contact on the disconnected node.
|
||||||
|
checkFinalMetrics.Begin()
|
||||||
|
checkClientMetrics(t, "Node A", env.ClientMetrics(a), map[string]int64{
|
||||||
|
"magicsock_cached_peer_contact_direct": 1,
|
||||||
|
})
|
||||||
|
checkFinalMetrics.End(nil)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -138,6 +138,13 @@ func (de *endpoint) udpRelayEndpointReady(maybeBest addrQuality) {
|
|||||||
func (de *endpoint) setBestAddrLocked(v addrQuality) {
|
func (de *endpoint) setBestAddrLocked(v addrQuality) {
|
||||||
if v.epAddr != de.bestAddr.epAddr {
|
if v.epAddr != de.bestAddr.epAddr {
|
||||||
de.probeUDPLifetime.resetCycleEndpointLocked()
|
de.probeUDPLifetime.resetCycleEndpointLocked()
|
||||||
|
|
||||||
|
// Reaching here, if we are using data from a cached netmap and we are
|
||||||
|
// upgrading from an invalid (missing) address to a valid one, increment
|
||||||
|
// the counter for peers established.
|
||||||
|
if !de.bestAddr.ap.IsValid() && v.ap.IsValid() && de.c.usingCachedNetmap.Load() {
|
||||||
|
metricCachedPeerContactDirect.Add(1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
de.bestAddr = v
|
de.bestAddr = v
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -272,6 +272,9 @@ type Conn struct {
|
|||||||
// discoAtomic is the current disco private and public keypair for this conn.
|
// discoAtomic is the current disco private and public keypair for this conn.
|
||||||
discoAtomic discoAtomic
|
discoAtomic discoAtomic
|
||||||
|
|
||||||
|
// usingCacheNetmap is whether the latest update to self and peersByID are from a cached network map
|
||||||
|
usingCachedNetmap atomic.Bool
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// mu guards all following fields; see userspaceEngine lock
|
// mu guards all following fields; see userspaceEngine lock
|
||||||
// ordering rules against the engine. For derphttp, mu must
|
// ordering rules against the engine. For derphttp, mu must
|
||||||
@@ -355,18 +358,19 @@ type Conn struct {
|
|||||||
// magicsock could do with any complexity reduction it can get.
|
// magicsock could do with any complexity reduction it can get.
|
||||||
netInfoLast *tailcfg.NetInfo
|
netInfoLast *tailcfg.NetInfo
|
||||||
|
|
||||||
derpMap *tailcfg.DERPMap // nil (or zero regions/nodes) means DERP is disabled
|
derpMap *tailcfg.DERPMap // nil (or zero regions/nodes) means DERP is disabled
|
||||||
self tailcfg.NodeView // from last SetNetworkMap
|
self tailcfg.NodeView // from last SetNetworkMap
|
||||||
peersByID map[tailcfg.NodeID]tailcfg.NodeView // current peer set, keyed by NodeID. Maintained by SetNetworkMap/UpsertPeer/RemovePeer. Note: per-field NodeMutation patches received in UpdateNetmapDelta are never applied to these snapshots.
|
peersByID map[tailcfg.NodeID]tailcfg.NodeView // current peer set, keyed by NodeID. Maintained by SetNetworkMap/UpsertPeer/RemovePeer. Note: per-field NodeMutation patches received in UpdateNetmapDelta are never applied to these snapshots.
|
||||||
filt *filter.Filter // from last SetFilter
|
|
||||||
relayClientEnabled bool // whether we can allocate UDP relay endpoints on UDP relay servers or receive CallMeMaybeVia messages from peers
|
filt *filter.Filter // from last SetFilter
|
||||||
lastFlags debugFlags // at time of last SetNetworkMap
|
relayClientEnabled bool // whether we can allocate UDP relay endpoints on UDP relay servers or receive CallMeMaybeVia messages from peers
|
||||||
privateKey key.NodePrivate // WireGuard private key for this node
|
lastFlags debugFlags // at time of last SetNetworkMap
|
||||||
everHadKey bool // whether we ever had a non-zero private key
|
privateKey key.NodePrivate // WireGuard private key for this node
|
||||||
myDerp int // nearest DERP region ID; 0 means none/unknown
|
everHadKey bool // whether we ever had a non-zero private key
|
||||||
homeless bool // if true, don't try to find & stay conneted to a DERP home (myDerp will stay 0)
|
myDerp int // nearest DERP region ID; 0 means none/unknown
|
||||||
derpStarted chan struct{} // closed on first connection to DERP; for tests & cleaner Close
|
homeless bool // if true, don't try to find & stay conneted to a DERP home (myDerp will stay 0)
|
||||||
activeDerp map[int]activeDerp // DERP regionID -> connection to a node in that region
|
derpStarted chan struct{} // closed on first connection to DERP; for tests & cleaner Close
|
||||||
|
activeDerp map[int]activeDerp // DERP regionID -> connection to a node in that region
|
||||||
prevDerp map[int]*syncs.WaitGroupChan
|
prevDerp map[int]*syncs.WaitGroupChan
|
||||||
|
|
||||||
// derpRoute contains optional alternate routes to use as an
|
// derpRoute contains optional alternate routes to use as an
|
||||||
@@ -2334,6 +2338,13 @@ func (c *Conn) handleDiscoMessage(msg []byte, src epAddr, shouldBeRelayHandshake
|
|||||||
c.logf("[unexpected] %s from peer via DERP whose netmap discokey != disco source", msgType)
|
c.logf("[unexpected] %s from peer via DERP whose netmap discokey != disco source", msgType)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reaching here, if we are using data from a cached network map the
|
||||||
|
// receipt of a CallMeMaybe from a peer indicates we have a sufficiently
|
||||||
|
// viable connection to that peer to count it as active while cached.
|
||||||
|
if c.usingCachedNetmap.Load() {
|
||||||
|
metricCachedPeerContactDERP.Add(1)
|
||||||
|
}
|
||||||
if isVia {
|
if isVia {
|
||||||
c.dlogf("[v1] magicsock: disco: %v<-%v via %v (%v, %v) got call-me-maybe-via, %d endpoints",
|
c.dlogf("[v1] magicsock: disco: %v<-%v via %v (%v, %v) got call-me-maybe-via, %d endpoints",
|
||||||
c.discoAtomic.Short(), epDisco.short, via.ServerDisco.ShortString(),
|
c.discoAtomic.Short(), epDisco.short, via.ServerDisco.ShortString(),
|
||||||
@@ -2954,9 +2965,10 @@ func (c *candidatePeerRelay) isValid() bool {
|
|||||||
return !c.nodeKey.IsZero() && !c.discoKey.IsZero()
|
return !c.nodeKey.IsZero() && !c.discoKey.IsZero()
|
||||||
}
|
}
|
||||||
|
|
||||||
// SetNetworkMap updates the network map with the given self node and peers.
|
// SetNetworkMap updates the network map with the given self node and peers
|
||||||
// It must be called synchronously from the caller's goroutine to ensure
|
// reported by the control plane (rather than cached). It must be called
|
||||||
// magicsock has the current state before subsequent operations proceed.
|
// synchronously from the caller's goroutine to ensure magicsock has the
|
||||||
|
// current state before subsequent operations proceed.
|
||||||
//
|
//
|
||||||
// self may be invalid if there's no network map.
|
// self may be invalid if there's no network map.
|
||||||
//
|
//
|
||||||
@@ -2966,6 +2978,18 @@ func (c *candidatePeerRelay) isValid() bool {
|
|||||||
// initial netmap and for changes to self or to global state (filter, DERP,
|
// initial netmap and for changes to self or to global state (filter, DERP,
|
||||||
// etc.) that aren't covered by the per-peer methods.
|
// etc.) that aren't covered by the per-peer methods.
|
||||||
func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) {
|
func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) {
|
||||||
|
c.setNetworkMapInternal(self, peers, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetNetworkMapCached behaves as SetNetworkMap, but indicates to c that the
|
||||||
|
// data provided are from a cache rather than the control plane. The same
|
||||||
|
// constraints otherwise apply.
|
||||||
|
func (c *Conn) SetNetworkMapCached(self tailcfg.NodeView, peers []tailcfg.NodeView) {
|
||||||
|
c.setNetworkMapInternal(self, peers, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
// setNetworkMapInternal is the shared implementation of SetNetworkMap and SetNetworkMapCached.
|
||||||
|
func (c *Conn) setNetworkMapInternal(self tailcfg.NodeView, peers []tailcfg.NodeView, isCached bool) {
|
||||||
peersChanged := c.updateNodes(self, peers)
|
peersChanged := c.updateNodes(self, peers)
|
||||||
|
|
||||||
relayClientEnabled := self.Valid() &&
|
relayClientEnabled := self.Valid() &&
|
||||||
@@ -2979,6 +3003,7 @@ func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) {
|
|||||||
selfView := c.self
|
selfView := c.self
|
||||||
peersSnap := c.peerSnapshotLocked()
|
peersSnap := c.peerSnapshotLocked()
|
||||||
isClosed := c.closed
|
isClosed := c.closed
|
||||||
|
c.usingCachedNetmap.Store(isCached)
|
||||||
c.mu.Unlock() // release c.mu before potentially calling c.updateRelayServersSet which is O(m * n)
|
c.mu.Unlock() // release c.mu before potentially calling c.updateRelayServersSet which is O(m * n)
|
||||||
|
|
||||||
if isClosed {
|
if isClosed {
|
||||||
@@ -4198,6 +4223,10 @@ var (
|
|||||||
metricTSMPDiscoKeyAdvertisementReceived = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_received")
|
metricTSMPDiscoKeyAdvertisementReceived = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_received")
|
||||||
metricTSMPDiscoKeyAdvertisementApplied = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_applied")
|
metricTSMPDiscoKeyAdvertisementApplied = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_applied")
|
||||||
metricTSMPDiscoKeyAdvertisementUnchanged = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_unchanged")
|
metricTSMPDiscoKeyAdvertisementUnchanged = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_unchanged")
|
||||||
|
|
||||||
|
// Counters for peer contacts established using cached network map data.
|
||||||
|
metricCachedPeerContactDERP = clientmetric.NewCounter("magicsock_cached_peer_contact_derp")
|
||||||
|
metricCachedPeerContactDirect = clientmetric.NewCounter("magicsock_cached_peer_contact_direct")
|
||||||
)
|
)
|
||||||
|
|
||||||
// newUDPLifetimeCounter returns a new *clientmetric.Metric with the provided
|
// newUDPLifetimeCounter returns a new *clientmetric.Metric with the provided
|
||||||
|
|||||||
Reference in New Issue
Block a user