ipn/ipnlocal,wgengine/magicsock: add basic counters for cached peer connectivity (#19699)
Add new clientmetric counters for establishing contact with peers while using cached network map data. To do this, instrument the magicsock.Conn with a bit to indicate whether its peer data came from a cached netmap. If so, there are two conditions we will count as establishing connectivity to a peer:

- Receipt of a CallMeMaybe from a peer via disco.
- Establishing a valid endpoint address for a peer.

In vmtest, add Env.ClientMetrics to scrape metrics from the specified node. Use this to check that counters were updated in caching tests.

Updates https://github.com/tailscale/projects/issues/13
Updates #12639

Change-Id: Ie8cf3244ac8af4f5bcfe4d0d944078da2ba08990
Signed-off-by: M. J. Fromberger <fromberger@tailscale.com>
This commit is contained in:
@@ -38,6 +38,8 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/google/gopacket/layers"
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
"github.com/prometheus/common/expfmt"
|
||||
"go4.org/mem"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"tailscale.com/client/local"
|
||||
@@ -847,6 +849,68 @@ func (e *Env) Status(n *Node) *ipnstate.Status {
|
||||
return st
|
||||
}
|
||||
|
||||
// ClientMetrics returns the client metrics exported by the given node.
|
||||
func (e *Env) ClientMetrics(n *Node) ClientMetrics {
|
||||
e.t.Helper()
|
||||
raw, err := n.Agent().DaemonMetrics(e.t.Context())
|
||||
if err != nil {
|
||||
e.t.Fatalf("Node %q DaemonMetrics: %v", n.Name(), err)
|
||||
}
|
||||
|
||||
// Metrics are reported in Prometheus exposition format.
|
||||
var parser expfmt.TextParser
|
||||
mfs, err := parser.TextToMetricFamilies(bytes.NewReader(raw))
|
||||
if err != nil {
|
||||
e.t.Fatalf("Node %q parse client metrics: %v", n.Name(), err)
|
||||
}
|
||||
|
||||
// Tailscale client metrics are all unlabelled integer-valued counters and
|
||||
// gauges, so we don't need to handle the full generality of the Prometheus
|
||||
// representation. If we see anything else, we'll log and skip it.
|
||||
out := make(ClientMetrics)
|
||||
for _, mf := range mfs {
|
||||
name := mf.GetName()
|
||||
if _, ok := out[name]; ok {
|
||||
e.t.Logf("Node %q: duplicate client metric %q (ignored)", n.Name(), name)
|
||||
continue
|
||||
} else if len(mf.Metric) != 1 {
|
||||
e.t.Logf("Node %q: got %d values for client metric %q, want 1 (ignored)", n.Name(), len(mf.Metric), name)
|
||||
continue
|
||||
}
|
||||
|
||||
var mtype string
|
||||
var value int64
|
||||
switch mf.GetType() {
|
||||
case dto.MetricType_COUNTER:
|
||||
mtype = "counter"
|
||||
value = int64(mf.Metric[0].GetCounter().GetValue())
|
||||
case dto.MetricType_GAUGE:
|
||||
mtype = "gauge"
|
||||
value = int64(mf.Metric[0].GetGauge().GetValue())
|
||||
default:
|
||||
e.t.Logf("Node %q unexpected client metric %q type %q (ignored)", n.Name(), name, mf.GetType().String())
|
||||
continue
|
||||
}
|
||||
out[name] = ClientMetric{
|
||||
Name: name,
|
||||
Type: mtype,
|
||||
Value: value,
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// ClientMetrics is a view of the client metrics exported by a node, as
// returned by Env.ClientMetrics.
|
||||
// The keys of the map are the metric names; each value repeats its key in
// its Name field.
|
||||
type ClientMetrics map[string]ClientMetric
|
||||
|
||||
// ClientMetric is a view of a single node client metric: its name, its kind,
// and its integer value.
|
||||
type ClientMetric struct {
|
||||
Name string // metric name, as published to the clientmetrics package
|
||||
Type string // metric kind: either "gauge" or "counter"
|
||||
Value int64 // the gauge or counter value
|
||||
}
|
||||
|
||||
// SetAcceptRoutes toggles the node's RouteAll preference (the
|
||||
// --accept-routes flag), controlling whether it installs subnet routes
|
||||
// advertised by peers.
|
||||
|
||||
@@ -923,6 +923,20 @@ func TestMullvadExitNode(t *testing.T) {
|
||||
check(checkOff2Step, "exit-off (again)", clientWAN)
|
||||
}
|
||||
|
||||
// checkClientMetrics verifies that each entry in want exists and has the given
|
||||
// value in metrics.
|
||||
func checkClientMetrics(t *testing.T, label string, metrics vmtest.ClientMetrics, want map[string]int64) {
|
||||
t.Helper()
|
||||
for name, wantValue := range want {
|
||||
got, ok := metrics[name]
|
||||
if !ok {
|
||||
t.Errorf("%s: required metric %q not found", label, name)
|
||||
} else if got.Value != wantValue {
|
||||
t.Errorf("%s: metric %q: got %v, want %v", label, name, got.Value, wantValue)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestCachedNetmapAfterRestart verifies that two nodes with netmap
|
||||
// caching enabled (NodeAttrCacheNetworkMaps) can re-establish a direct
|
||||
// WireGuard tunnel after both are restarted while the control server is
|
||||
@@ -1020,13 +1034,23 @@ func TestDirectConnectionWithCachedNetmapOnOneNode(t *testing.T) {
|
||||
vmtest.OS(vmtest.Gokrazy),
|
||||
tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil})
|
||||
|
||||
checkInitialMetrics := env.AddStep("Check initial client metrics")
|
||||
cutControlStep := env.AddStep("Cut control server access")
|
||||
restartStep := env.AddStep("Restart tailscaled on a")
|
||||
tsmpPingStep := env.AddStep("Ping a → b TSMP (cached netmap, no control)")
|
||||
DiscoPingStep := env.AddStep("Ping a → b Disco (want Direct)")
|
||||
discoPingStep := env.AddStep("Ping a → b Disco (want Direct)")
|
||||
checkFinalMetrics := env.AddStep("Check final client metrics")
|
||||
|
||||
env.Start()
|
||||
|
||||
// Before: Verify that we have not recorded any cached contacts.
|
||||
checkInitialMetrics.Begin()
|
||||
checkClientMetrics(t, "Node A", env.ClientMetrics(a), map[string]int64{
|
||||
"magicsock_cached_peer_contact_derp": 0,
|
||||
"magicsock_cached_peer_contact_direct": 0,
|
||||
})
|
||||
checkInitialMetrics.End(nil)
|
||||
|
||||
cutControlStep.Begin()
|
||||
a.DropControlTraffic()
|
||||
env.ControlServer().SetOnMapRequest(func(nk key.NodePublic) {
|
||||
@@ -1047,10 +1071,17 @@ func TestDirectConnectionWithCachedNetmapOnOneNode(t *testing.T) {
|
||||
}
|
||||
tsmpPingStep.End(nil)
|
||||
|
||||
DiscoPingStep.Begin()
|
||||
discoPingStep.Begin()
|
||||
if err := env.PingExpect(a, b, vmtest.PingRouteDirect, 30*time.Second); err != nil {
|
||||
DiscoPingStep.End(err)
|
||||
discoPingStep.End(err)
|
||||
t.Fatal(err)
|
||||
}
|
||||
DiscoPingStep.End(nil)
|
||||
discoPingStep.End(nil)
|
||||
|
||||
// After: Verify that we recorded a direct contact on the disconnected node.
|
||||
checkFinalMetrics.Begin()
|
||||
checkClientMetrics(t, "Node A", env.ClientMetrics(a), map[string]int64{
|
||||
"magicsock_cached_peer_contact_direct": 1,
|
||||
})
|
||||
checkFinalMetrics.End(nil)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user