WIP: rebase for 2026-05-18 #7

Draft
codinget wants to merge 234 commits from rebase/2026-05-18 into webnet
7 changed files with 519 additions and 11 deletions
Showing only changes of commit 15cba0a3f6 - Show all commits
+47
View File
@@ -0,0 +1,47 @@
// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package main
import (
"fmt"
"os"
"strconv"
"strings"
)
// init installs restartTailscaledLinux as the restartTailscaled hook.
func init() {
restartTailscaled = restartTailscaledLinux
}
// restartTailscaledLinux scans the numeric entries of /proc for a process
// whose comm is "tailscaled" and sends SIGKILL to the first one it finds.
// It does not start a replacement itself: on gokrazy, the supervisor is
// expected to respawn tailscaled within a few seconds. The PID of the killed
// process is returned.
func restartTailscaledLinux() (int, error) {
	procDir, err := os.ReadDir("/proc")
	if err != nil {
		return 0, err
	}
	for _, ent := range procDir {
		name := ent.Name()
		pid, convErr := strconv.Atoi(name)
		if convErr != nil {
			// Entry name is not a number, so it is not a PID directory.
			continue
		}
		raw, readErr := os.ReadFile("/proc/" + name + "/comm")
		if readErr != nil || strings.TrimSpace(string(raw)) != "tailscaled" {
			// Unreadable comm, or some other program: keep looking.
			continue
		}
		target, findErr := os.FindProcess(pid)
		if findErr != nil {
			return 0, findErr
		}
		if killErr := target.Kill(); killErr != nil {
			return 0, fmt.Errorf("killing tailscaled pid %d: %w", pid, killErr)
		}
		return pid, nil
	}
	return 0, fmt.Errorf("tailscaled process not found in /proc")
}
+17
View File
@@ -391,6 +391,18 @@ func main() {
}
wgServerUp(w, r)
})
// /restart-tailscaled invokes the platform's restartTailscaled hook and
// reports the PID it killed. When no hook is installed it answers 501.
ttaMux.HandleFunc("/restart-tailscaled", func(w http.ResponseWriter, r *http.Request) {
	if restartTailscaled == nil {
		http.Error(w, "restart-tailscaled not supported on this platform", http.StatusNotImplemented)
		return
	}
	switch killedPID, killErr := restartTailscaled(); {
	case killErr != nil:
		http.Error(w, killErr.Error(), http.StatusInternalServerError)
	default:
		fmt.Fprintf(w, "killed tailscaled pid %d (supervisor will respawn)\n", killedPID)
	}
})
ttaMux.HandleFunc("/logs", func(w http.ResponseWriter, r *http.Request) {
logBuf.mu.Lock()
defer logBuf.mu.Unlock()
@@ -600,6 +612,11 @@ var addFirewall func() error // set by fw_linux.go
// non-Linux.
var wgServerUp func(w http.ResponseWriter, r *http.Request)
// restartTailscaled sends SIGKILL to the local tailscaled process so the
// gokrazy supervisor restarts it, and returns the PID of the process it
// killed. It is set by restart_tailscaled_linux.go and is nil on non-Linux,
// so callers must check it for nil before calling it.
var restartTailscaled func() (pid int, err error)
// logBuffer is a bytes.Buffer that is safe for concurrent use
// intended to capture early logs from the process, even if
// gokrazy's syslog streaming isn't working or isn't working yet.
+36
View File
@@ -7093,11 +7093,28 @@ func (b *LocalBackend) DebugRotateDiscoKey() error {
b.mu.Lock()
cc := b.cc
wantRunning := b.pm.CurrentPrefs().WantRunning()
b.mu.Unlock()
if cc != nil {
cc.SetDiscoPublicKey(newDiscoKey)
}
// Bounce WantRunning to fully reset wireguard-go state for all peers.
if wantRunning {
if _, err := b.EditPrefs(&ipn.MaskedPrefs{
Prefs: ipn.Prefs{WantRunning: false},
WantRunningSet: true,
}); err != nil {
return err
}
if _, err := b.EditPrefs(&ipn.MaskedPrefs{
Prefs: ipn.Prefs{WantRunning: true},
WantRunningSet: true,
}); err != nil {
return err
}
}
return nil
}
@@ -7105,6 +7122,25 @@ func (b *LocalBackend) DebugPeerRelayServers() set.Set[netip.Addr] {
return b.MagicConn().PeerRelays()
}
// DebugPeerDiscoKeys reports, for every peer in the current network map that
// has a non-zero disco key, the mapping from the peer's node public key to
// that disco public key. It returns nil when there is no network map.
//
// It is intended for tests: the production [ipnstate.PeerStatus] purposefully
// does not surface disco keys, and surfacing them via the [ipnstate.Status]
// API would also pollute every PeerStatus consumer with a non-comparable
// struct field.
func (b *LocalBackend) DebugPeerDiscoKeys() map[key.NodePublic]key.DiscoPublic {
	netMap := b.currentNode().NetMap()
	if netMap == nil {
		return nil
	}
	peers := netMap.Peers
	discoByNode := make(map[key.NodePublic]key.DiscoPublic, len(peers))
	for i := range peers {
		peer := peers[i]
		disco := peer.DiscoKey()
		if disco.IsZero() {
			// Peers without a known disco key are left out of the result.
			continue
		}
		discoByNode[peer.Key()] = disco
	}
	return discoByNode
}
// ControlKnobs returns the node's control knobs.
func (b *LocalBackend) ControlKnobs() *controlknobs.Knobs {
return b.sys.ControlKnobs()
+6
View File
@@ -232,6 +232,12 @@ func (h *Handler) serveDebug(w http.ResponseWriter, r *http.Request) {
if err == nil {
return
}
case "peer-disco-keys":
// Reply with the JSON encoding of LocalBackend.DebugPeerDiscoKeys. A
// successful encode is the entire response, so return right away. An
// encode error is left in err, presumably for the code after this
// switch (not visible here) to report.
w.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w).Encode(h.b.DebugPeerDiscoKeys())
if err == nil {
return
}
case "rotate-disco-key":
err = h.b.DebugRotateDiscoKey()
case "statedir":
@@ -69,6 +69,22 @@ type Server struct {
// belong to the same user.
AllNodesSameUser bool
// AllOnline, if true, marks every peer entry in MapResponses as
// Online=true. This is a coarse stand-in for the per-node
// online/offline tracking that production control servers do based
// on streaming map sessions: certain disco-key handling fast paths
// in [tailscale.com/control/controlclient] and
// [tailscale.com/wgengine/userspace] only fire when the peer is
// reported online, so without this flag they are silently skipped
// in tests, which can mask bugs and slow down recovery from disco
// rotations. See [tailscale.com/control/controlclient/map.go]
// removeUnwantedDiscoUpdates and
// removeUnwantedDiscoUpdatesFromFullNetmapUpdate for callers that
// branch on Online.
//
// Finer-grained per-node online tracking can be added later.
AllOnline bool
// DefaultNodeCapabilities overrides the capability map sent to each client.
DefaultNodeCapabilities *tailcfg.NodeCapMap
@@ -1405,6 +1421,9 @@ func (s *Server) MapResponse(req *tailcfg.MapRequest) (res *tailcfg.MapResponse,
p.PrimaryRoutes = routes
p.AllowedIPs = append(p.AllowedIPs, routes...)
}
if s.AllOnline {
p.Online = new(true)
}
res.Peers = append(res.Peers, p)
}
+171 -11
View File
@@ -19,6 +19,7 @@ import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"flag"
"fmt"
"io"
@@ -43,6 +44,7 @@ import (
"tailscale.com/ipn"
"tailscale.com/ipn/ipnstate"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/tstest/integration/testcontrol"
"tailscale.com/tstest/natlab/vnet"
"tailscale.com/types/key"
@@ -85,6 +87,7 @@ type Env struct {
qemuProcs []*exec.Cmd // launched QEMU processes
sameTailnetUser bool // all nodes register as the same Tailnet user
allOnline bool // mark every peer as Online=true in MapResponses
// Shared resource initialization (sync.Once for things multiple nodes share).
vnetOnce sync.Once
@@ -346,6 +349,16 @@ func SameTailnetUser() EnvOption {
return envOptFunc(func(e *Env) { e.sameTailnetUser = true })
}
// AllOnline is an [EnvOption] constructor. The option it returns sets the
// Env's allOnline flag, which makes the test control server mark every peer
// as Online=true in MapResponses (testcontrol.Server.AllOnline).
//
// Several disco-key handling fast paths in the controlclient and wgengine
// only fire when the peer is reported online; without this option those
// paths are silently skipped, which can mask bugs and slow down recovery
// from disco-key rotations.
func AllOnline() EnvOption {
	markAllOnline := func(env *Env) {
		env.allOnline = true
	}
	return envOptFunc(markAllOnline)
}
// AddNetwork creates a new virtual network. Arguments follow the same pattern as
// vnet.Config.AddNetwork (string IPs, NAT types, NetworkService values).
func (e *Env) AddNetwork(opts ...any) *vnet.Network {
@@ -414,6 +427,11 @@ func (e *Env) AddNode(name string, opts ...any) *Node {
// Name returns the node's name as set in [Env.AddNode].
func (n *Node) Name() string {
return n.name
}

// LanIP returns the LAN IPv4 address of this node on the given network.
// This is only valid after Env.Start() has been called.
func (n *Node) LanIP(net *vnet.Network) netip.Addr {
return n.vnetNode.LanIP(net)
}
@@ -864,33 +882,172 @@ func (e *Env) ApproveRoutes(n *Node, routes ...string) {
}
}
// ping does a disco ping from one node to another's Tailscale IP, retrying
// for up to 30 seconds, fataling on failure. It is used internally to wake
// up magicsock peer state before a test runs; tests that want to assert
// connectivity should use [Env.Ping] with the appropriate ping type and
// timeout.
//
// All timeout handling lives in [Env.Ping]; ping creates no context of its
// own.
func (e *Env) ping(from, to *Node) {
	e.t.Helper()
	if err := e.Ping(from, to, tailcfg.PingDisco, 30*time.Second); err != nil {
		e.t.Fatal(err)
	}
}
// Ping pings from one node to another's Tailscale IP using the given ping
// type, retrying until it succeeds or timeout expires. It returns the error
// from the last attempt if the timeout expires. Unlike the internal ping
// helper, it does not fatal the test on failure; callers can check the error
// to assert on timing.
//
// [tailcfg.PingTSMP] actually flows packets across the WireGuard tunnel and is
// the right choice for asserting end-to-end connectivity.
// [tailcfg.PingDisco] only exchanges disco messages between magicsock layers
// and is useful for warming up peer state without requiring a working tunnel.
//
// The target address is the first Tailscale IP reported by to's status. A
// failure to fetch that status, or a status with no Tailscale IPs, is
// returned as an error without retrying.
func (e *Env) Ping(from, to *Node, ptype tailcfg.PingType, timeout time.Duration) error {
	e.t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	toSt, err := to.agent.Status(ctx)
	if err != nil {
		return fmt.Errorf("ping: can't get %s status: %w", to.name, err)
	}
	if len(toSt.Self.TailscaleIPs) == 0 {
		return fmt.Errorf("ping: %s has no Tailscale IPs", to.name)
	}
	targetIP := toSt.Self.TailscaleIPs[0]

	var lastErr error
	for {
		// Per-attempt timeout: cap at 3s but never exceed the remaining budget.
		attemptTimeout := 3 * time.Second
		if d := time.Until(deadline(ctx)); d < attemptTimeout {
			attemptTimeout = d
		}
		if attemptTimeout <= 0 {
			break
		}
		pingCtx, pingCancel := context.WithTimeout(ctx, attemptTimeout)
		pr, err := from.agent.PingWithOpts(pingCtx, targetIP, ptype, local.PingOpts{})
		pingCancel()
		if err == nil && pr.Err == "" {
			e.logVerbosef("ping(%s): %s -> %s OK", ptype, from.name, targetIP)
			return nil
		}
		// Remember why this attempt failed: a transport-level error wins
		// over an error reported inside the ping result.
		switch {
		case err != nil:
			lastErr = err
		case pr.Err != "":
			lastErr = fmt.Errorf("%s", pr.Err)
		}
		if ctx.Err() != nil {
			break
		}
		time.Sleep(500 * time.Millisecond)
	}
	if lastErr == nil {
		lastErr = ctx.Err()
	}
	if lastErr == nil {
		// The budget ran out by the wall clock before the context's own
		// timer recorded it; never hand a nil error to %w below.
		lastErr = context.DeadlineExceeded
	}
	return fmt.Errorf("ping(%s): %s -> %s (%s) timed out after %v: %w", ptype, from.name, to.name, targetIP, timeout, lastErr)
}
// deadline reports when ctx expires. A context that has no deadline yields
// the zero [time.Time].
func deadline(ctx context.Context) (when time.Time) {
	when, _ = ctx.Deadline()
	return when
}
// PeerDiscoKey reports the disco key that node n currently has on record for
// peer. The results are:
//
//   - (key, true, nil) when n knows a disco key for peer;
//   - (zero, false, nil) when n is reachable but has no record of peer in its
//     current netmap;
//   - (zero, false, err) when the LocalAPI request fails (e.g. tailscaled is
//     briefly unavailable during a restart) or the reply cannot be decoded.
//
// It never fatals the test, which makes it suitable for use inside a
// [tstest.WaitFor] poll loop.
//
// The key comes from the debug-only "peer-disco-keys" LocalAPI action
// ([ipnlocal.LocalBackend.DebugPeerDiscoKeys]) rather than from
// [ipnstate.Status], to keep the production PeerStatus struct free of disco
// keys (and free of non-comparable fields like [key.DiscoPublic] that break
// reflect-based test helpers).
func (e *Env) PeerDiscoKey(n *Node, peer key.NodePublic) (key.DiscoPublic, bool, error) {
	var zero key.DiscoPublic

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	generic, err := n.agent.DebugResultJSON(ctx, "peer-disco-keys")
	if err != nil {
		return zero, false, err
	}

	// DebugResultJSON hands the reply back as a generic any, so the map is
	// keyed by the text encoding of each node key. Round-trip it through
	// JSON into a typed map for a cleaner lookup; the extra encode/decode is
	// fine for a test helper.
	encoded, err := json.Marshal(generic)
	if err != nil {
		return zero, false, fmt.Errorf("re-marshal: %w", err)
	}
	var byNode map[key.NodePublic]key.DiscoPublic
	if err := json.Unmarshal(encoded, &byNode); err != nil {
		return zero, false, fmt.Errorf("unmarshal peer-disco-keys: %w", err)
	}

	disco, found := byNode[peer]
	return disco, found, nil
}
// RotateDiscoKey runs the "rotate-disco-key" LocalAPI debug action on n,
// asking its tailscaled to rotate the discovery (magicsock) key in place.
// The node key, control connection, and other tailscaled state are
// unaffected. The request is given 30 seconds; any error fatals the test.
func (e *Env) RotateDiscoKey(n *Node) {
	e.t.Helper()

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	err := n.agent.DebugAction(ctx, "rotate-disco-key")
	if err == nil {
		return
	}
	e.t.Fatalf("RotateDiscoKey(%s): %v", n.name, err)
}
// RestartTailscaled signals tailscaled on n to die so that its supervisor
// (gokrazy) restarts it. It then waits for tailscaled to come back to the
// "Running" backend state. It fatals the test on error.
//
// Restarting tailscaled is currently only supported on gokrazy nodes; for
// any other node the test is fataled before anything is sent.
//
// The kill is requested through the node agent's /restart-tailscaled
// endpoint. The whole operation shares one 60-second context, and the wait
// for "Running" is additionally bounded to 45 seconds.
func (e *Env) RestartTailscaled(n *Node) {
	e.t.Helper()
	if !n.os.IsGokrazy {
		e.t.Fatalf("RestartTailscaled(%s): only supported on gokrazy nodes (have %q)", n.name, n.os.Name)
	}
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://unused/restart-tailscaled", nil)
	if err != nil {
		e.t.Fatalf("RestartTailscaled(%s): %v", n.name, err)
	}
	res, err := n.agent.HTTPClient.Do(req)
	if err != nil {
		e.t.Fatalf("RestartTailscaled(%s): %v", n.name, err)
	}
	body, readErr := io.ReadAll(res.Body)
	res.Body.Close()
	if res.StatusCode != http.StatusOK {
		e.t.Fatalf("RestartTailscaled(%s): %s: %s", n.name, res.Status, body)
	}
	if readErr != nil {
		// The status already says the kill happened and the body is only
		// informational, so report the read failure without failing the test.
		e.t.Logf("RestartTailscaled(%s): reading response body: %v", n.name, readErr)
	}
	e.t.Logf("[%s] %s", n.name, strings.TrimSpace(string(body)))

	// Wait for tailscaled to come back. Status calls will fail while the unix
	// socket is gone, then return Starting/NeedsLogin briefly before settling
	// on Running.
	if err := tstest.WaitFor(45*time.Second, func() error {
		st, err := n.agent.Status(ctx)
		if err != nil {
			return err
		}
		if st.BackendState != "Running" {
			return fmt.Errorf("backend state = %q", st.BackendState)
		}
		return nil
	}); err != nil {
		e.t.Fatalf("RestartTailscaled(%s): waiting for Running: %v", n.name, err)
	}
}
@@ -1094,6 +1251,9 @@ func (e *Env) initVnet() {
if e.sameTailnetUser {
e.server.ControlServer().AllNodesSameUser = true
}
if e.allOnline {
e.server.ControlServer().AllOnline = true
}
})
}
+223
View File
@@ -9,11 +9,14 @@ import (
"net/netip"
"strings"
"testing"
"time"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/tstest/integration/testcontrol"
"tailscale.com/tstest/natlab/vmtest"
"tailscale.com/tstest/natlab/vnet"
"tailscale.com/types/key"
)
func TestMacOSAndLinuxCanPing(t *testing.T) {
@@ -534,6 +537,226 @@ func TestExitNode(t *testing.T) {
}
}
// TestDiscoKeyChange verifies that when one node's disco key rotates without
// its WireGuard node key changing, peers detect the change, tear down stale
// WireGuard session state for that peer, and re-establish the tunnel in both
// directions. This exercises the disco-key-change handling that the
// bradfitz/rm_lazy_wg branch relies on for traffic to and from a peer whose
// magicsock state has been reset.
//
// Topology: two gokrazy nodes A and B, each on its own One2OneNAT network so
// every connection between them is a direct UDP path with no port-mapping or
// filtering. With NAT effects out of the way, what we measure here is the
// speed of disco-key-change reconciliation in wgengine/magicsock alone. The
// test control server is also configured with [testcontrol.Server.AllOnline]
// (via [vmtest.AllOnline]) so the controlclient/wgengine fast paths that
// branch on Online actually fire — without that flag the test exercises
// only the offline-peer code paths, which mask separate latent issues and
// are several seconds slower.
//
// The test runs four B-side rotations followed by a TSMP ping in the
// requested direction:
//
// rotate (LocalAPI rotate-disco-key) → ping B → A
// rotate (LocalAPI rotate-disco-key) → ping A → B
// restart (SIGKILL tailscaled) → ping B → A
// restart (SIGKILL tailscaled) → ping A → B
//
// Plus an initial A→B TSMP ping with a generous 30s budget to bring up the
// WireGuard tunnel before the rotations begin (so the post-rotation pings
// measure stale-state recovery, not first-time setup). All pings are TSMP
// because TSMP traverses the actual WireGuard data plane; PingDisco only
// exercises the magicsock disco layer and would mask any stale WG session
// problems.
//
// Two rotation methods are exercised:
//
// - LocalAPI rotate-disco-key (debug action): rolls B's magicsock disco
// private key in place, then bounces WantRunning to force wgengine to
// drop wireguard-go session keys for every peer (RotateDiscoKey alone
// only touches local disco state; without the WantRunning bounce, B
// keeps using stale per-peer session keys against A and A drops
// everything until B's WG rekey timer eventually fires).
// - SIGKILL of tailscaled (via TTA's /restart-tailscaled): the gokrazy
// supervisor respawns tailscaled, fully resetting B's magicsock and
// wgengine state in addition to rotating the disco key.
//
// Each post-rotation ping currently gets a 15-second budget. On a
// hypothetical perfect build it should take well under a second. In
// practice today there are two unavoidable multi-second waits:
//
// - The rotate-then-a→b phase on main takes ~10s for LazyWG. After
// B's WantRunning bounce, B's wgengine resets its sentActivityAt/
// recvActivityAt maps and trims A out of the wireguard-go config
// as an "idle peer"; B only re-adds A on inbound activity, by
// which point A's first few TSMP packets have been silently
// dropped at B's tundev. The bradfitz/rm_lazy_wg branch removes
// that trimming entirely (verified locally), so this phase will
// drop to <100ms once that branch lands.
//
// - The restart phases take ~5s for the wireguard-go handshake retry
// timer. After SIGKILL+respawn the first WG handshake init from
// the restarted node sometimes goes into the void (likely the
// brief peer-removed window in the receiver's two-step
// [wgengine.userspaceEngine.maybeReconfigWireguardLocked] reconfig
// during which the peer is absent from wireguard-go), and wg-go's
// [device.RekeyTimeout] of 5s + jitter is the next opportunity to
// retry. That retry succeeds and the staged TSMP packet flushes.
// This is intrinsic to the protocol's retransmit policy.
//
// Once LazyWG is removed and the first-handshake-after-reconfig race
// is fixed, this budget should be tightened to 5s (or less).
//
// All four rotations also assert that B's WireGuard node key is unchanged.
func TestDiscoKeyChange(t *testing.T) {
	// With AllOnline the test control server reports every peer as
	// Online=true in its MapResponses. Several disco-key handling fast paths
	// (controlclient.removeUnwantedDiscoUpdates,
	// removeUnwantedDiscoUpdatesFromFullNetmapUpdate, and the wgengine
	// tsmpLearnedDisco fast path) only fire for online peers. Production
	// control servers always populate Online; without this flag the test
	// would only exercise the offline-peer paths.
	env := vmtest.New(t, vmtest.AllOnline())

	// One2OneNAT gives each node a 1:1 mapping to a public WAN IP with no
	// port-translation or address-port filtering, so A↔B traffic behaves
	// like two unfirewalled hosts on the public internet and any slowness
	// seen here cannot be blamed on NAT traversal.
	aNet := env.AddNetwork("1.0.0.1", "192.168.1.1/24", vnet.One2OneNAT)
	bNet := env.AddNetwork("2.0.0.1", "192.168.2.1/24", vnet.One2OneNAT)
	a := env.AddNode("a", aNet, vmtest.OS(vmtest.Gokrazy))
	b := env.AddNode("b", bNet, vmtest.OS(vmtest.Gokrazy))

	// Both rotation methods always act on b; only the ping direction varies
	// between phases.
	rotateB := func() { env.RotateDiscoKey(b) }
	restartB := func() { env.RestartTailscaled(b) }

	type phase struct {
		name             string
		rotate           func()
		pingFrom, pingTo *vmtest.Node

		applyStep, verify, wait, ping *vmtest.Step
	}
	phases := []*phase{
		{name: "rotate (LocalAPI), b → a", rotate: rotateB, pingFrom: b, pingTo: a},
		{name: "rotate (LocalAPI), a → b", rotate: rotateB, pingFrom: a, pingTo: b},
		{name: "restart, b → a", rotate: restartB, pingFrom: b, pingTo: a},
		{name: "restart, a → b", rotate: restartB, pingFrom: a, pingTo: b},
	}

	// All steps are registered up front, before env.Start, in the order in
	// which they will run.
	establishStep := env.AddStep("Ping a → b TSMP (establish tunnel)")
	for _, ph := range phases {
		ph.applyStep = env.AddStep("Apply: " + ph.name)
		ph.verify = env.AddStep("Verify b: same node key, new disco key (" + ph.name + ")")
		ph.wait = env.AddStep("Wait for a to see b's new disco key (" + ph.name + ")")
		ph.ping = env.AddStep("Ping " + ph.pingFrom.Name() + " → " + ph.pingTo.Name() + " TSMP (" + ph.name + ")")
	}

	env.Start()

	// Bring the tunnel up once so that later pings measure recovery from
	// stale state rather than first-time setup.
	establishStep.Begin()
	establishErr := env.Ping(a, b, tailcfg.PingTSMP, 30*time.Second)
	establishStep.End(establishErr)
	if establishErr != nil {
		t.Fatal(establishErr)
	}

	// Record b's identity before any rotation: the node key must stay the
	// same throughout, and the disco key is the baseline for the first phase.
	nodeKeyB := env.Status(b).Self.PublicKey
	ctlNodeB := env.ControlServer().Node(nodeKeyB)
	if ctlNodeB == nil {
		t.Fatalf("control server has no node for b's key %v", nodeKeyB)
	}
	curDisco := ctlNodeB.DiscoKey
	if curDisco.IsZero() {
		t.Fatalf("control server has no disco key for b before rotation")
	}
	t.Logf("[b] initial: nodekey=%s discokey=%s", nodeKeyB.ShortString(), curDisco.ShortString())

	for _, ph := range phases {
		ph.applyStep.Begin()
		ph.rotate()
		ph.applyStep.End(nil)

		// Each phase's new disco key becomes the "old" key for the next one.
		curDisco = checkDiscoRotated(t, env, a, b, ph.pingFrom, ph.pingTo, nodeKeyB, curDisco, ph.name,
			ph.verify, ph.wait, ph.ping)
	}
}
// checkDiscoRotated runs the three post-rotation checks for one phase of
// TestDiscoKeyChange, after some action that should have rotated b's disco
// key:
//
//  1. verifyStep: b's node key still equals bNodeKey, and within 15s the
//     control server holds a non-zero disco key for b that differs from
//     oldDisco.
//  2. waitStep: within 30s, a's local view of b shows that new disco key.
//  3. pingStep: pingFrom can TSMP-ping pingTo within the 15s budget.
//
// It returns b's new disco key and fatals the test on any failure. label
// prefixes the log and failure messages.
//
// The TSMP ping budget is 15 seconds rather than the few hundred ms it
// ought to take. See the top-level test docstring for a full breakdown:
// it has to absorb LazyWG's trim+re-add for the rotate-a→b phase (~10s)
// and wireguard-go's RekeyTimeout retry for the SIGKILL+restart phases
// (~5s). Tighten this once both are addressed.
func checkDiscoRotated(t *testing.T, env *vmtest.Env, a, b, pingFrom, pingTo *vmtest.Node, bNodeKey key.NodePublic, oldDisco key.DiscoPublic, label string, verifyStep, waitStep, pingStep *vmtest.Step) key.DiscoPublic {
	t.Helper()
	control := env.ControlServer()

	// Step 1: node key unchanged, control has a fresh disco key.
	verifyStep.Begin()
	status := env.Status(b)
	if got := status.Self.PublicKey; got != bNodeKey {
		keyErr := fmt.Errorf("[%s] b's node key changed: %v -> %v", label, bNodeKey, got)
		verifyStep.End(keyErr)
		t.Fatal(keyErr)
	}

	var newDisco key.DiscoPublic
	controlHasNewDisco := func() error {
		node := control.Node(bNodeKey)
		switch {
		case node == nil:
			return fmt.Errorf("control server has no node for b")
		case node.DiscoKey.IsZero(), node.DiscoKey == oldDisco:
			return fmt.Errorf("control still has old disco key %v for b", node.DiscoKey)
		}
		newDisco = node.DiscoKey
		return nil
	}
	if err := tstest.WaitFor(15*time.Second, controlHasNewDisco); err != nil {
		verifyStep.End(err)
		t.Fatalf("[%s] %v", label, err)
	}
	t.Logf("[b] after %s: nodekey=%s discokey=%s", label, bNodeKey.ShortString(), newDisco.ShortString())
	verifyStep.End(nil)

	// Step 2: a's own view of b catches up with the new disco key.
	waitStep.Begin()
	aSeesNewDisco := func() error {
		seen, known, err := env.PeerDiscoKey(a, bNodeKey)
		switch {
		case err != nil:
			return err
		case !known:
			return fmt.Errorf("a doesn't yet have b in its status")
		case seen != newDisco:
			return fmt.Errorf("a still sees b's old disco %v, want %v", seen.ShortString(), newDisco.ShortString())
		}
		return nil
	}
	if err := tstest.WaitFor(30*time.Second, aSeesNewDisco); err != nil {
		waitStep.End(err)
		env.DumpStatus(a)
		t.Fatalf("[%s] %v", label, err)
	}
	waitStep.End(nil)

	// Step 3: data-plane connectivity in the requested direction.
	pingStep.Begin()
	start := time.Now()
	if err := env.Ping(pingFrom, pingTo, tailcfg.PingTSMP, 15*time.Second); err != nil {
		pingStep.End(err)
		env.DumpStatus(a)
		env.DumpStatus(b)
		t.Fatalf("[%s] %v", label, err)
	}
	elapsed := time.Since(start).Round(100 * time.Millisecond)
	t.Logf("[%s] ping %s -> %s succeeded in %v", label, pingFrom.Name(), pingTo.Name(), elapsed)
	pingStep.End(nil)

	return newDisco
}
// TestMullvadExitNode verifies that a Tailscale client whose netmap contains
// a plain-WireGuard exit node (the way Mullvad exit nodes are wired up by
// the control plane) can route internet traffic through it, with the source