Files
tailscale/tstest/natlab/vmtest/vmtest_test.go
T
Brad Fitzpatrick e062b46984 tstest/natlab, .github/workflows: add opt-in natlab CI workflow
The natlab vmtest suite (tstest/natlab/vmtest) and the integration nat
tests are gated behind --run-vm-tests because they need KVM and are
slow. Until now nothing in CI exercised them apart from a single
canary TestEasyEasy run on every PR.

Add .github/workflows/natlab-test.yml that runs the full opt-in suite
on demand (workflow_dispatch), on PRs labeled "natlab", and on main
every 12 hours via cron. The workflow has two phases:

  - "prepare" builds the gokrazy VM image, downloads the Ubuntu and
    FreeBSD cloud images once via the new natlabprep tool, and emits
    a dynamic JSON matrix of every TestX function it finds in the two
    opt-in packages.
  - "test" is a per-test matrix that depends on prepare. Each matrix
    job restores the shared caches and runs a single test, so adding
    a new TestFoo is automatically picked up on the next run without
    any workflow edits.

Rename the existing natlab-integrationtest.yml to natlab-basic.yml
since it's the small smoke variant (just TestEasyEasy on every PR);
the new natlab-test.yml is the bigger suite. The job inside is
renamed to EasyEasy for the same reason.

Move the macOS arm64 host check from vmtest.Env.Start into
vmtest.Env.AddNode so a test that adds a vmtest.MacOS node skips
immediately on a non-macOS host, and add an explicit
skipIfNotMacOSArm64 helper at the top of the two macOS-only tests
so the platform requirement is obvious to readers.

Quiet the takeAgentConnOne miss log in tstest/natlab/vnet by default
(it was the overwhelming majority of bytes in CI logs, with no signal
in healthy runs) and replace it with a periodic "still waiting" line
that only fires after 10s, so a truly stuck agent connection still
surfaces.

Updates #13038

Change-Id: I4582098d8865200fd5a73a9b696942319ccf3bf0
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2026-05-11 17:14:46 -07:00

1057 lines
38 KiB
Go

// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package vmtest_test
import (
"bytes"
"fmt"
"net/netip"
"runtime"
"strings"
"testing"
"time"
"tailscale.com/client/local"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/tstest/integration/testcontrol"
"tailscale.com/tstest/natlab/vmtest"
"tailscale.com/tstest/natlab/vnet"
"tailscale.com/types/key"
"tailscale.com/types/netmap"
)
// skipIfNotMacOSArm64 skips t unless the test binary is running on a
// darwin/arm64 host.
//
// macOS VM tests require Apple Virtualization.framework via tailmac.
// AddNode performs the same check whenever a macOS node is added; calling
// this helper first thing in a macOS-only test simply makes the host
// requirement visible to anyone reading the test.
func skipIfNotMacOSArm64(t *testing.T) {
	t.Helper()
	onMacOSArm64 := runtime.GOOS == "darwin" && runtime.GOARCH == "arm64"
	if onMacOSArm64 {
		return
	}
	t.Skipf("macOS VM tests require a macOS arm64 host (got %s/%s)", runtime.GOOS, runtime.GOARCH)
}
// TestMacOSAndLinuxCanPing puts one Gokrazy node and one macOS node on the
// same LAN, neither joined to the tailnet, and LAN-pings the macOS node's
// LAN IP from the Gokrazy node.
func TestMacOSAndLinuxCanPing(t *testing.T) {
	skipIfNotMacOSArm64(t)

	env := vmtest.New(t)
	sharedLAN := env.AddNetwork("192.168.1.1/24")

	linuxVM := env.AddNode("linux", sharedLAN, vmtest.OS(vmtest.Gokrazy), vmtest.DontJoinTailnet())
	macVM := env.AddNode("macos", sharedLAN, vmtest.OS(vmtest.MacOS), vmtest.DontJoinTailnet())

	env.Start()

	macAddr := macVM.LanIP(sharedLAN)
	env.LANPing(linuxVM, macAddr)
}
// TestTwoMacOSVMsCanPing puts two macOS nodes on the same LAN, neither
// joined to the tailnet, and LAN-pings in both directions between them.
func TestTwoMacOSVMsCanPing(t *testing.T) {
	skipIfNotMacOSArm64(t)

	env := vmtest.New(t)
	sharedLAN := env.AddNetwork("192.168.1.1/24")
	first := env.AddNode("mac1", sharedLAN, vmtest.OS(vmtest.MacOS), vmtest.DontJoinTailnet())
	second := env.AddNode("mac2", sharedLAN, vmtest.OS(vmtest.MacOS), vmtest.DontJoinTailnet())
	env.Start()

	// Both macOS VMs have TTA, so either one can originate the ping:
	// mac1 -> mac2 first, then mac2 -> mac1.
	directions := [][2]*vmtest.Node{
		{first, second},
		{second, first},
	}
	for _, d := range directions {
		src, dst := d[0], d[1]
		env.LANPing(src, dst.LanIP(sharedLAN))
	}
}
func TestSubnetRouter(t *testing.T) {
testSubnetRouterForOS(t, vmtest.Ubuntu2404)
}
func TestSubnetRouterFreeBSD(t *testing.T) {
testSubnetRouterForOS(t, vmtest.FreeBSD150)
}
// testSubnetRouterForOS runs the subnet-router scenario using srOS as the
// OS image of the subnet router.
//
// Topology: a Gokrazy client on an EasyNAT network; a subnet router attached
// to both that network and an internal network, advertising 10.0.0.0/24; and
// a Gokrazy backend on the internal network that is not on the tailnet and
// runs a web server on port 8080. Once the route is approved, the client
// issues an HTTP GET to the backend's internal LAN IP and the response body
// must contain the backend's greeting.
func testSubnetRouterForOS(t testing.TB, srOS vmtest.OSImage) {
	t.Helper()
	env := vmtest.New(t)

	natNet := env.AddNetwork("2.1.1.1", "192.168.1.1/24", "2000:1::1/64", vnet.EasyNAT)
	privNet := env.AddNetwork("10.0.0.1/24", "2000:2::1/64")

	clientNode := env.AddNode("client", natNet, vmtest.OS(vmtest.Gokrazy))
	router := env.AddNode("subnet-router", natNet, privNet,
		vmtest.OS(srOS),
		vmtest.AdvertiseRoutes("10.0.0.0/24"))
	backendNode := env.AddNode("backend", privNet,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.DontJoinTailnet(),
		vmtest.WebServer(8080))

	// Test-specific steps, declared for the web UI.
	stepApprove := env.AddStep("Approve subnet routes")
	stepHTTP := env.AddStep("HTTP GET through subnet router")

	env.Start()

	stepApprove.Begin()
	env.ApproveRoutes(router, "10.0.0.0/24")
	stepApprove.End(nil)

	stepHTTP.Begin()
	target := fmt.Sprintf("http://%s:8080/", backendNode.LanIP(privNet))
	body := env.HTTPGet(clientNode, target)
	if strings.Contains(body, "Hello world I am backend") {
		stepHTTP.End(nil)
		return
	}
	// Record the failure on the step before aborting the test.
	err := fmt.Errorf("got %q", body)
	stepHTTP.End(err)
	t.Fatal(err)
}
func TestSiteToSite(t *testing.T) {
testSiteToSite(t, vmtest.Ubuntu2404)
}
// testSiteToSite runs a site-to-site subnet routing test with
// --snat-subnet-routes=false, verifying that original source IPs are preserved
// across Tailscale subnet routes.
//
// Topology:
//
// Site A: backend-a (10.1.0.0/24) ← → sr-a (WAN + LAN-A)
// Site B: backend-b (10.2.0.0/24) ← → sr-b (WAN + LAN-B)
//
// Both subnet routers are on Tailscale with --snat-subnet-routes=false.
// The test sends HTTP from backend-a to backend-b through the subnet routers
// and verifies that backend-b sees backend-a's LAN IP (not the subnet router's).
func testSiteToSite(t *testing.T, srOS vmtest.OSImage) {
	env := vmtest.New(t)

	// Per-site WAN networks (each behind NAT), then per-site internal LANs.
	siteAWAN := env.AddNetwork("2.1.1.1", "192.168.1.1/24", vnet.EasyNAT)
	siteBWAN := env.AddNetwork("3.1.1.1", "192.168.2.1/24", vnet.EasyNAT)
	siteALAN := env.AddNetwork("10.1.0.1/24")
	siteBLAN := env.AddNetwork("10.2.0.1/24")

	// One subnet router per site, attached to that site's WAN and LAN and
	// advertising the local LAN. SNAT is disabled so source IPs survive.
	routerA := env.AddNode("sr-a", siteAWAN, siteALAN,
		vmtest.OS(srOS),
		vmtest.AdvertiseRoutes("10.1.0.0/24"),
		vmtest.SNATSubnetRoutes(false))
	routerB := env.AddNode("sr-b", siteBWAN, siteBLAN,
		vmtest.OS(srOS),
		vmtest.AdvertiseRoutes("10.2.0.0/24"),
		vmtest.SNATSubnetRoutes(false))

	// One backend server per LAN, not on Tailscale. They run Ubuntu so we
	// can SSH in to add static routes.
	hostA := env.AddNode("backend-a", siteALAN,
		vmtest.OS(vmtest.Ubuntu2404),
		vmtest.DontJoinTailnet(),
		vmtest.WebServer(8080))
	hostB := env.AddNode("backend-b", siteBLAN,
		vmtest.OS(vmtest.Ubuntu2404),
		vmtest.DontJoinTailnet(),
		vmtest.WebServer(8080))

	// Test-specific steps, declared for the web UI.
	stepApprove := env.AddStep("Approve subnet routes (sr-a, sr-b)")
	stepRoutes := env.AddStep("Add static routes on backends")
	stepHTTP := env.AddStep("HTTP GET through site-to-site")

	env.Start()

	stepApprove.Begin()
	env.ApproveRoutes(routerA, "10.1.0.0/24")
	env.ApproveRoutes(routerB, "10.2.0.0/24")
	stepApprove.End(nil)

	// Point each backend's route for the remote site's subnet at its local
	// subnet router, as a real site-to-site deployment would.
	routerALAN := routerA.LanIP(siteALAN).String()
	routerBLAN := routerB.LanIP(siteBLAN).String()
	t.Logf("sr-a LAN IP: %s, sr-b LAN IP: %s", routerALAN, routerBLAN)
	t.Logf("backend-a LAN IP: %s, backend-b LAN IP: %s", hostA.LanIP(siteALAN), hostB.LanIP(siteBLAN))

	stepRoutes.Begin()
	env.AddRoute(hostA, "10.2.0.0/24", routerALAN)
	env.AddRoute(hostB, "10.1.0.0/24", routerBLAN)
	stepRoutes.End(nil)

	// HTTP from backend-a to backend-b via the two subnet routers. TTA's
	// /http-get falls back to direct dial on non-Tailscale nodes.
	stepHTTP.Begin()
	target := fmt.Sprintf("http://%s:8080/", hostB.LanIP(siteBLAN))
	body := env.HTTPGet(hostA, target)
	t.Logf("response: %s", body)
	if !strings.Contains(body, "Hello world I am backend-b") {
		err := fmt.Errorf("expected response from backend-b, got %q", body)
		stepHTTP.End(err)
		t.Fatal(err)
	}

	// With --snat-subnet-routes=false, backend-b must report backend-a's
	// LAN IP as the source rather than sr-b's LAN IP.
	wantSrc := hostA.LanIP(siteALAN).String()
	if !strings.Contains(body, "from "+wantSrc) {
		err := fmt.Errorf("source IP not preserved: expected %q in response, got %q", wantSrc, body)
		stepHTTP.End(err)
		t.Fatal(err)
	}
	stepHTTP.End(nil)
}
// TestInterNetworkTCP verifies that vnet routes raw TCP between simulated
// networks: a non-Tailscale VM on one NAT'd LAN can reach a webserver on a
// different network using a 1:1 NAT, and the webserver sees the client's
// network's WAN IP as the source (post-NAT).
func TestInterNetworkTCP(t *testing.T) {
	env := vmtest.New(t)

	const (
		clientWAN = "1.0.0.1"
		webWAN    = "5.0.0.1"
	)
	srcNet := env.AddNetwork(clientWAN, "192.168.1.1/24", vnet.EasyNAT)
	dstNet := env.AddNetwork(webWAN, "192.168.5.1/24", vnet.One2OneNAT)

	// Neither node joins the tailnet; this test exercises raw TCP only.
	clientNode := env.AddNode("client", srcNet,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.DontJoinTailnet())
	env.AddNode("webserver", dstNet,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.DontJoinTailnet(),
		vmtest.WebServer(8080))

	// Test-specific step, declared for the web UI.
	stepHTTP := env.AddStep("HTTP GET across networks via NAT")

	env.Start()

	stepHTTP.Begin()
	target := fmt.Sprintf("http://%s:8080/", webWAN)
	body := env.HTTPGet(clientNode, target)
	t.Logf("response: %s", body)

	// The first failing check, if any, becomes the step's error.
	var err error
	switch {
	case !strings.Contains(body, "Hello world I am webserver"):
		err = fmt.Errorf("unexpected response: %q", body)
	case !strings.Contains(body, "from "+clientWAN):
		err = fmt.Errorf("expected source %q in response, got %q", clientWAN, body)
	}
	stepHTTP.End(err)
	if err != nil {
		t.Fatal(err)
	}
}
// TestSubnetRouterPublicIP verifies that toggling --accept-routes on the
// client switches between dialing a webserver directly and routing through a
// subnet router that advertises the webserver's public IP range.
//
// Topology: client, subnet router, and webserver each live behind their own
// NAT'd network with distinct WAN IPs; the subnet router advertises the
// webserver's network as a route. The webserver echoes the source IP it
// sees:
// - accept-routes=off: client dials webserver directly; source is client's WAN.
// - accept-routes=on: client tunnels to the subnet router, which forwards
// and SNATs; source is subnet router's WAN.
func TestSubnetRouterPublicIP(t *testing.T) {
	env := vmtest.New(t)

	const (
		clientWAN = "1.0.0.1"
		routerWAN = "2.0.0.1"
		webWAN    = "5.0.0.1"
		webRoute  = "5.0.0.0/24"
	)
	netClient := env.AddNetwork(clientWAN, "192.168.1.1/24", vnet.EasyNAT)
	netRouter := env.AddNetwork(routerWAN, "192.168.2.1/24", vnet.EasyNAT)
	netWeb := env.AddNetwork(webWAN, "192.168.5.1/24", vnet.One2OneNAT)

	clientNode := env.AddNode("client", netClient, vmtest.OS(vmtest.Gokrazy))
	router := env.AddNode("subnet-router", netRouter,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.AdvertiseRoutes(webRoute))
	env.AddNode("webserver", netWeb,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.DontJoinTailnet(),
		vmtest.WebServer(8080))

	// Test-specific steps, declared for the web UI.
	stepApprove := env.AddStep("Approve subnet route (public IP)")
	stepOnFirst := env.AddStep("HTTP GET (accept-routes=on)")
	stepOff := env.AddStep("HTTP GET (accept-routes=off)")
	stepOnAgain := env.AddStep("HTTP GET (accept-routes=on, again)")

	env.Start()

	// ApproveRoutes also turns on RouteAll on the client.
	stepApprove.Begin()
	env.ApproveRoutes(router, webRoute)
	stepApprove.End(nil)

	webURL := fmt.Sprintf("http://%s:8080/", webWAN)

	// fetchAndVerify runs one HTTP GET inside step and requires that the
	// webserver answered and reported wantSrc as the source address.
	fetchAndVerify := func(step *vmtest.Step, label, wantSrc string) {
		t.Helper()
		step.Begin()
		body := env.HTTPGet(clientNode, webURL)
		t.Logf("[%s] response: %s", label, body)

		var err error
		switch {
		case !strings.Contains(body, "Hello world I am webserver"):
			err = fmt.Errorf("[%s] unexpected webserver response: %q", label, body)
		case !strings.Contains(body, "from "+wantSrc):
			err = fmt.Errorf("[%s] expected source %q in response, got %q", label, wantSrc, body)
		}
		step.End(err)
		if err != nil {
			t.Fatal(err)
		}
	}

	// accept-routes=on (set by ApproveRoutes): traffic goes via the subnet router.
	fetchAndVerify(stepOnFirst, "accept-routes=on", routerWAN)

	// accept-routes=off: the client dials the webserver directly.
	env.SetAcceptRoutes(clientNode, false)
	fetchAndVerify(stepOff, "accept-routes=off", clientWAN)

	// Back on again, to confirm the transition works in both directions.
	env.SetAcceptRoutes(clientNode, true)
	fetchAndVerify(stepOnAgain, "accept-routes=on (again)", routerWAN)
}
// TestSubnetRouterAndExitNode checks how the subnet router and exit node
// preferences interact. Topology: client, subnet router, exit node, and
// webserver, each on its own NAT'd network with distinct WAN IPs. The subnet
// router advertises the webserver's network (5.0.0.0/24); the exit node
// advertises 0.0.0.0/0 + ::/0. The webserver echoes the source IP it sees:
//
// exit=off, subnet=off → client's WAN (direct dial)
// exit=off, subnet=on → subnet router's WAN
// exit=on, subnet=off → exit node's WAN
// exit=on, subnet=on → subnet router's WAN (more-specific /24 beats /0)
func TestSubnetRouterAndExitNode(t *testing.T) {
	env := vmtest.New(t)

	const (
		clientWAN = "1.0.0.1"
		routerWAN = "2.0.0.1"
		exitWAN   = "3.0.0.1"
		webWAN    = "5.0.0.1"
		webRoute  = "5.0.0.0/24"
	)
	netClient := env.AddNetwork(clientWAN, "192.168.1.1/24", vnet.EasyNAT)
	netRouter := env.AddNetwork(routerWAN, "192.168.2.1/24", vnet.EasyNAT)
	netExit := env.AddNetwork(exitWAN, "192.168.3.1/24", vnet.EasyNAT)
	netWeb := env.AddNetwork(webWAN, "192.168.5.1/24", vnet.One2OneNAT)

	clientNode := env.AddNode("client", netClient, vmtest.OS(vmtest.Gokrazy))
	router := env.AddNode("subnet-router", netRouter,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.AdvertiseRoutes(webRoute))
	exitNode := env.AddNode("exit", netExit,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.AdvertiseRoutes("0.0.0.0/0,::/0"))
	env.AddNode("webserver", netWeb,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.DontJoinTailnet(),
		vmtest.WebServer(8080))

	// Test-specific steps, declared for the web UI.
	stepApprove := env.AddStep("Approve subnet & exit routes")

	webURL := fmt.Sprintf("http://%s:8080/", webWAN)

	// scenario is one (exit, subnet) combination and the source address
	// the webserver is expected to report for it.
	type scenario struct {
		name    string       // subtest name; describes (exit, subnet) toggles
		exit    *vmtest.Node // exit node to select, or nil for none
		subnet  bool         // value passed to SetAcceptRoutes on the client
		wantSrc string
		step    *vmtest.Step
	}
	scenarios := []*scenario{
		{name: "exit-off,subnet-off", wantSrc: clientWAN},
		{name: "exit-off,subnet-on", subnet: true, wantSrc: routerWAN},
		{name: "exit-on,subnet-off", exit: exitNode, wantSrc: exitWAN},
		// More-specific 5.0.0.0/24 from sr beats 0.0.0.0/0 from exit.
		{name: "exit-on,subnet-on", exit: exitNode, subnet: true, wantSrc: routerWAN},
	}
	for _, sc := range scenarios {
		sc.step = env.AddStep("HTTP GET: " + sc.name)
	}

	env.Start()

	stepApprove.Begin()
	env.ApproveRoutes(router, webRoute)
	env.ApproveRoutes(exitNode, "0.0.0.0/0", "::/0")
	// Keep the exit node from forwarding via the subnet router: when the
	// client uses only the exit node, the exit node should egress to the
	// simulated internet directly so the webserver sees the exit's WAN.
	env.SetAcceptRoutes(exitNode, false)
	stepApprove.End(nil)

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			sc.step.Begin()
			env.SetExitNode(clientNode, sc.exit)
			env.SetAcceptRoutes(clientNode, sc.subnet)
			body := env.HTTPGet(clientNode, webURL)
			t.Logf("response: %s", body)

			var err error
			switch {
			case !strings.Contains(body, "Hello world I am webserver"):
				err = fmt.Errorf("unexpected webserver response: %q", body)
			case !strings.Contains(body, "from "+sc.wantSrc):
				err = fmt.Errorf("expected source %q in response, got %q", sc.wantSrc, body)
			}
			sc.step.End(err)
			if err != nil {
				t.Fatal(err)
			}
		})
	}
}
// TestTaildrop verifies that one Ubuntu node can send a file to another
// Ubuntu node via Taildrop, and the receiver gets the same content.
//
// Topology: two Ubuntu nodes, each behind its own EasyNAT, both joined to the
// tailnet. The sender runs `tailscale file cp` to push to the receiver's
// Tailscale IP; the receiver then runs `tailscale file get --wait` to fetch
// it.
func TestTaildrop(t *testing.T) {
	env := vmtest.New(t, vmtest.SameTailnetUser())

	netSender := env.AddNetwork("1.0.0.1", "192.168.1.1/24", vnet.EasyNAT)
	netReceiver := env.AddNetwork("2.0.0.1", "192.168.2.1/24", vnet.EasyNAT)
	src := env.AddNode("sender", netSender, vmtest.OS(vmtest.Ubuntu2404))
	dst := env.AddNode("receiver", netReceiver, vmtest.OS(vmtest.Ubuntu2404))

	// Test-specific steps, declared for the web UI.
	stepSend := env.AddStep("Taildrop send (sender -> receiver)")
	stepRecv := env.AddStep("Taildrop receive (on receiver)")
	stepVerify := env.AddStep("Verify received name and contents")

	env.Start()

	const filename = "hello.txt"
	want := []byte("hello world this is a Taildrop test\n")

	stepSend.Begin()
	env.SendTaildropFile(src, dst, filename, want)
	stepSend.End(nil)

	stepRecv.Begin()
	gotName, gotContent := env.RecvTaildropFile(t.Context(), dst)
	stepRecv.End(nil)

	// The name is checked before the contents; only the first mismatch is
	// reported.
	stepVerify.Begin()
	var err error
	switch {
	case gotName != filename:
		err = fmt.Errorf("received name = %q; want %q", gotName, filename)
	case !bytes.Equal(gotContent, want):
		err = fmt.Errorf("received content = %q; want %q", gotContent, want)
	}
	stepVerify.End(err)
	if err != nil {
		t.Error(err)
	}
}
// TestExitNode verifies that switching the client's exit node setting between
// off, exit1, and exit2 correctly routes the client's internet traffic.
//
// Topology: each of the client and the two exit nodes lives behind its own NAT
// with a unique WAN IP, and a webserver lives on yet another network using a
// 1:1 NAT so it's reachable from the simulated internet at a stable address.
// The webserver echoes the source IP of incoming requests, so we can tell
// which network's NAT the client's traffic egressed through:
// - off: source is the client's network WAN IP.
// - exit1: source is exit1's network WAN IP.
// - exit2: source is exit2's network WAN IP.
func TestExitNode(t *testing.T) {
	env := vmtest.New(t)

	const (
		clientWAN = "1.0.0.1"
		exit1WAN  = "2.0.0.1"
		exit2WAN  = "3.0.0.1"
		webWAN    = "5.0.0.1"
	)
	netClient := env.AddNetwork(clientWAN, "192.168.1.1/24", vnet.EasyNAT)
	netExit1 := env.AddNetwork(exit1WAN, "192.168.2.1/24", vnet.EasyNAT)
	netExit2 := env.AddNetwork(exit2WAN, "192.168.3.1/24", vnet.EasyNAT)
	netWeb := env.AddNetwork(webWAN, "192.168.5.1/24", vnet.One2OneNAT)

	clientNode := env.AddNode("client", netClient, vmtest.OS(vmtest.Gokrazy))
	exitOne := env.AddNode("exit1", netExit1,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.AdvertiseRoutes("0.0.0.0/0,::/0"))
	exitTwo := env.AddNode("exit2", netExit2,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.AdvertiseRoutes("0.0.0.0/0,::/0"))
	env.AddNode("webserver", netWeb,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.DontJoinTailnet(),
		vmtest.WebServer(8080))

	// Test-specific steps, declared for the web UI.
	stepApprove := env.AddStep("Approve exit-node routes (exit1, exit2)")

	webURL := fmt.Sprintf("http://%s:8080/", webWAN)

	// scenario is one exit-node selection and the source address the
	// webserver is expected to report for it.
	type scenario struct {
		name    string       // subtest name
		exit    *vmtest.Node // exit node to select, or nil for none
		wantSrc string
		step    *vmtest.Step
	}
	scenarios := []*scenario{
		{name: "off", wantSrc: clientWAN},
		{name: "exit1", exit: exitOne, wantSrc: exit1WAN},
		{name: "exit2", exit: exitTwo, wantSrc: exit2WAN},
	}
	for _, sc := range scenarios {
		sc.step = env.AddStep("HTTP GET: exit=" + sc.name)
	}

	env.Start()

	stepApprove.Begin()
	env.ApproveRoutes(exitOne, "0.0.0.0/0", "::/0")
	env.ApproveRoutes(exitTwo, "0.0.0.0/0", "::/0")
	stepApprove.End(nil)

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			sc.step.Begin()
			env.SetExitNode(clientNode, sc.exit)
			body := env.HTTPGet(clientNode, webURL)
			t.Logf("response: %s", body)

			var err error
			switch {
			case !strings.Contains(body, "Hello world I am webserver"):
				err = fmt.Errorf("unexpected webserver response: %q", body)
			case !strings.Contains(body, "from "+sc.wantSrc):
				err = fmt.Errorf("expected source %q in response, got %q", sc.wantSrc, body)
			}
			sc.step.End(err)
			if err != nil {
				t.Fatal(err)
			}
		})
	}
}
// TestDiscoKeyChange verifies that when one node's disco key rotates without
// its WireGuard node key changing, peers detect the change, tear down stale
// WireGuard session state for that peer, and re-establish the tunnel in both
// directions. This exercises the disco-key-change handling that the
// bradfitz/rm_lazy_wg branch relies on for traffic to and from a peer whose
// magicsock state has been reset.
//
// Topology: two gokrazy nodes A and B, each on its own One2OneNAT network so
// every connection between them is a direct UDP path with no port-mapping or
// filtering. With NAT effects out of the way, what we measure here is the
// speed of disco-key-change reconciliation in wgengine/magicsock alone. The
// test control server is also configured with [testcontrol.Server.AllOnline]
// (via [vmtest.AllOnline]) so the controlclient/wgengine fast paths that
// branch on Online actually fire — without that flag the test exercises
// only the offline-peer code paths, which mask separate latent issues and
// are several seconds slower.
//
// The test runs four B-side rotations followed by a TSMP ping in the
// requested direction:
//
// rotate (LocalAPI rotate-disco-key) → ping B → A
// rotate (LocalAPI rotate-disco-key) → ping A → B
// restart (SIGKILL tailscaled) → ping B → A
// restart (SIGKILL tailscaled) → ping A → B
//
// Plus an initial A→B TSMP ping with a generous 30s budget to bring up the
// WireGuard tunnel before the rotations begin (so the post-rotation pings
// measure stale-state recovery, not first-time setup). All pings are TSMP
// because TSMP traverses the actual WireGuard data plane; PingDisco only
// exercises the magicsock disco layer and would mask any stale WG session
// problems.
//
// Two rotation methods are exercised:
//
// - LocalAPI rotate-disco-key (debug action): rolls B's magicsock disco
// private key in place, then bounces WantRunning to force wgengine to
// drop wireguard-go session keys for every peer (RotateDiscoKey alone
// only touches local disco state; without the WantRunning bounce, B
// keeps using stale per-peer session keys against A and A drops
// everything until B's WG rekey timer eventually fires).
// - SIGKILL of tailscaled (via TTA's /kill-tailscaled): the gokrazy
// supervisor respawns tailscaled, fully resetting B's magicsock and
// wgengine state in addition to rotating the disco key.
//
// Each post-rotation ping currently gets a 15-second budget. On a
// hypothetical perfect build it should take well under a second. In
// practice today there are two unavoidable multi-second waits:
//
// - The rotate-then-a→b phase on main takes ~10s for LazyWG. After
// B's WantRunning bounce, B's wgengine resets its sentActivityAt/
// recvActivityAt maps and trims A out of the wireguard-go config
// as an "idle peer"; B only re-adds A on inbound activity, by
// which point A's first few TSMP packets have been silently
// dropped at B's tundev. The bradfitz/rm_lazy_wg branch removes
// that trimming entirely (verified locally), so this phase will
// drop to <100ms once that branch lands.
//
// - The restart phases take ~5s for the wireguard-go handshake retry
// timer. After SIGKILL+respawn the first WG handshake init from
// the restarted node sometimes goes into the void (likely the
// brief peer-removed window in the receiver's two-step
// [wgengine.userspaceEngine.maybeReconfigWireguardLocked] reconfig
// during which the peer is absent from wireguard-go), and wg-go's
// [device.RekeyTimeout] of 5s + jitter is the next opportunity to
// retry. That retry succeeds and the staged TSMP packet flushes.
// This is intrinsic to the protocol's retransmit policy.
//
// Once LazyWG is removed and the first-handshake-after-reconfig race
// is fixed, this budget should be tightened to 5s (or less).
//
// All four rotations also assert that B's WireGuard node key is unchanged.
func TestDiscoKeyChange(t *testing.T) {
	// With AllOnline the test control server marks every peer Online=true
	// in its MapResponses. Several disco-key fast paths
	// (controlclient.removeUnwantedDiscoUpdates,
	// removeUnwantedDiscoUpdatesFromFullNetmapUpdate, and the wgengine
	// tsmpLearnedDisco fast path) only fire for online peers. Production
	// control servers always populate Online; without the flag this test
	// would only exercise the offline-peer paths.
	env := vmtest.New(t, vmtest.AllOnline())

	// One2OneNAT gives each node a 1:1 mapping to a public WAN IP with no
	// port translation or address-port filtering, so A↔B traffic behaves
	// like two unfirewalled hosts on the public internet and slowness seen
	// here cannot be blamed on NAT traversal.
	netA := env.AddNetwork("1.0.0.1", "192.168.1.1/24", vnet.One2OneNAT)
	netB := env.AddNetwork("2.0.0.1", "192.168.2.1/24", vnet.One2OneNAT)
	a := env.AddNode("a", netA, vmtest.OS(vmtest.Gokrazy))
	b := env.AddNode("b", netB, vmtest.OS(vmtest.Gokrazy))

	// Both rotation methods always act on b.
	rotateViaLocalAPI := func() { env.RotateDiscoKey(b) }
	rotateViaRestart := func() { env.RestartTailscaled(b) }

	// phase is one rotation of b's disco key followed by a TSMP ping in
	// the given direction, plus the web-UI steps that track it.
	type phase struct {
		name      string
		rotate    func()
		pingFrom  *vmtest.Node
		pingTo    *vmtest.Node
		applyStep *vmtest.Step
		verify    *vmtest.Step
		wait      *vmtest.Step
		ping      *vmtest.Step
	}
	phases := []*phase{
		{name: "rotate (LocalAPI), b → a", rotate: rotateViaLocalAPI, pingFrom: b, pingTo: a},
		{name: "rotate (LocalAPI), a → b", rotate: rotateViaLocalAPI, pingFrom: a, pingTo: b},
		{name: "restart, b → a", rotate: rotateViaRestart, pingFrom: b, pingTo: a},
		{name: "restart, a → b", rotate: rotateViaRestart, pingFrom: a, pingTo: b},
	}

	stepInitialPing := env.AddStep("Ping a → b TSMP (establish tunnel)")
	for _, p := range phases {
		p.applyStep = env.AddStep("Apply: " + p.name)
		p.verify = env.AddStep("Verify b: same node key, new disco key (" + p.name + ")")
		p.wait = env.AddStep("Wait for a to see b's new disco key (" + p.name + ")")
		p.ping = env.AddStep("Ping " + p.pingFrom.Name() + " → " + p.pingTo.Name() + " TSMP (" + p.name + ")")
	}

	env.Start()

	// Initial a → b ping with a 30s budget, run before any rotation.
	stepInitialPing.Begin()
	err := env.Ping(a, b, tailcfg.PingTSMP, 30*time.Second)
	stepInitialPing.End(err)
	if err != nil {
		t.Fatal(err)
	}

	// Capture b's node key and the disco key control currently has for
	// it; every phase is compared against these.
	nodeKeyB := env.Status(b).Self.PublicKey
	ctlNodeB := env.ControlServer().Node(nodeKeyB)
	if ctlNodeB == nil {
		t.Fatalf("control server has no node for b's key %v", nodeKeyB)
	}
	lastDisco := ctlNodeB.DiscoKey
	if lastDisco.IsZero() {
		t.Fatalf("control server has no disco key for b before rotation")
	}
	t.Logf("[b] initial: nodekey=%s discokey=%s", nodeKeyB.ShortString(), lastDisco.ShortString())

	for _, p := range phases {
		p.applyStep.Begin()
		p.rotate()
		p.applyStep.End(nil)

		// Each phase's resulting disco key is the "old" key for the next.
		lastDisco = checkDiscoRotated(t, env, a, b, p.pingFrom, p.pingTo,
			nodeKeyB, lastDisco, p.name, p.verify, p.wait, p.ping)
	}
}
// checkDiscoRotated verifies that after some action that should have rotated
// b's disco key, control has learned the new key, b's node key is unchanged,
// a's local view picks up the new disco key, and pingFrom can ping pingTo
// (TSMP) within the budget. It returns b's new disco key and fatals on
// failure.
//
// The TSMP ping budget is 15 seconds rather than the few hundred ms it
// ought to take. See the top-level test docstring for a full breakdown:
// it has to absorb LazyWG's trim+re-add for the rotate-a→b phase (~10s)
// and wireguard-go's RekeyTimeout retry for the SIGKILL+restart phases
// (~5s). Tighten this once both are addressed.
func checkDiscoRotated(t *testing.T, env *vmtest.Env, a, b, pingFrom, pingTo *vmtest.Node, bNodeKey key.NodePublic, oldDisco key.DiscoPublic, label string, verifyStep, waitStep, pingStep *vmtest.Step) key.DiscoPublic {
	t.Helper()
	control := env.ControlServer()

	// Verify step: b kept its node key and control learned a new disco key.
	verifyStep.Begin()
	if got := env.Status(b).Self.PublicKey; got != bNodeKey {
		err := fmt.Errorf("[%s] b's node key changed: %v -> %v", label, bNodeKey, got)
		verifyStep.End(err)
		t.Fatal(err)
	}

	var newDisco key.DiscoPublic
	// controlHasRotated succeeds once control reports a non-zero disco key
	// for b that differs from oldDisco, and records it in newDisco.
	controlHasRotated := func() error {
		n := control.Node(bNodeKey)
		switch {
		case n == nil:
			return fmt.Errorf("control server has no node for b")
		case n.DiscoKey.IsZero() || n.DiscoKey == oldDisco:
			return fmt.Errorf("control still has old disco key %v for b", n.DiscoKey)
		}
		newDisco = n.DiscoKey
		return nil
	}
	if err := tstest.WaitFor(15*time.Second, controlHasRotated); err != nil {
		verifyStep.End(err)
		t.Fatalf("[%s] %v", label, err)
	}
	t.Logf("[b] after %s: nodekey=%s discokey=%s", label, bNodeKey.ShortString(), newDisco.ShortString())
	verifyStep.End(nil)

	// Wait step: a's view of b must pick up the new disco key.
	waitStep.Begin()
	peerHasNewDisco := func() error {
		seen, found, err := env.PeerDiscoKey(a, bNodeKey)
		switch {
		case err != nil:
			return err
		case !found:
			return fmt.Errorf("a doesn't yet have b in its status")
		case seen != newDisco:
			return fmt.Errorf("a still sees b's old disco %v, want %v", seen.ShortString(), newDisco.ShortString())
		}
		return nil
	}
	if err := tstest.WaitFor(30*time.Second, peerHasNewDisco); err != nil {
		waitStep.End(err)
		env.DumpStatus(a)
		t.Fatalf("[%s] %v", label, err)
	}
	waitStep.End(nil)

	// Ping step: TSMP ping in the requested direction within 15 seconds.
	pingStep.Begin()
	started := time.Now()
	if err := env.Ping(pingFrom, pingTo, tailcfg.PingTSMP, 15*time.Second); err != nil {
		pingStep.End(err)
		env.DumpStatus(a)
		env.DumpStatus(b)
		t.Fatalf("[%s] %v", label, err)
	}
	t.Logf("[%s] ping %s -> %s succeeded in %v", label, pingFrom.Name(), pingTo.Name(), time.Since(started).Round(100*time.Millisecond))
	pingStep.End(nil)

	return newDisco
}
// TestMullvadExitNode verifies that a Tailscale client whose netmap contains
// a plain-WireGuard exit node (the way Mullvad exit nodes are wired up by
// the control plane) can route internet traffic through it, with the source
// IP rewritten to the per-client Mullvad-assigned address.
//
// Topology:
//
// client (Tailscale, gokrazy) — clientNet (EasyNAT) WAN 1.0.0.1
// mullvad (Ubuntu, userspace WG) — mullvadNet (One2OneNAT) WAN 2.0.0.1
// webserver (no Tailscale, gokrazy) — webNet (One2OneNAT) WAN 5.0.0.1
//
// The mullvad VM impersonates a Mullvad WireGuard server. After boot, the
// test asks its TTA agent to bring up a userspace WireGuard interface (a
// real Linux TUN driven by wireguard-go) that pins the client's Tailscale
// node public key as its only allowed peer, sets up IP-forwarding + a
// MASQUERADE rule, and reports the WG server's freshly generated public
// key back. Userspace vs kernel WireGuard makes no difference on the wire
// — what's being tested is Tailscale's plain-WireGuard exit-node code
// path, not the kernel module.
//
// The test then injects a netmap peer with IsWireGuardOnly=true,
// AllowedIPs=[gw/32, 0.0.0.0/0, ::/0], the WG endpoint, and a per-client
// SelfNodeV4MasqAddrForThisPeer (the mock equivalent of the per-client IP
// Mullvad's API hands out at registration time).
//
// The webserver echoes the source IP it sees:
// - exit-node off: source is client's WAN (direct egress)
// - exit-node on: source is mullvad's WAN (egress via WG + MASQUERADE)
func TestMullvadExitNode(t *testing.T) {
	env := vmtest.New(t)

	// WAN addresses of the three simulated networks, plus the UDP port the
	// WG server on the mullvad VM listens on.
	const (
		clientWAN  = "1.0.0.1"
		mullvadWAN = "2.0.0.1"
		webWAN     = "5.0.0.1"

		wgListenPort uint16 = 51820
	)

	// Addressing on the Mullvad-side WG network: mullvad terminates the
	// tunnel at gw, and the client shows up on mullvad's wg0 as clientMasq.
	var (
		mullvadWGNet = netip.MustParsePrefix("10.64.0.0/24")
		gw           = netip.MustParsePrefix("10.64.0.1/24")
		clientMasq   = netip.MustParsePrefix("10.64.0.2/32")
	)
	gwIP := gw.Addr()

	// Topology: one network per VM. Only the client joins the tailnet.
	clientNet := env.AddNetwork(clientWAN, "192.168.1.1/24", vnet.EasyNAT)
	mullvadNet := env.AddNetwork(mullvadWAN, "192.168.2.1/24", vnet.One2OneNAT)
	webNet := env.AddNetwork(webWAN, "192.168.5.1/24", vnet.One2OneNAT)

	client := env.AddNode("client", clientNet, vmtest.OS(vmtest.Gokrazy))
	mullvad := env.AddNode("mullvad", mullvadNet,
		vmtest.OS(vmtest.Ubuntu2404),
		vmtest.DontJoinTailnet(),
	)
	env.AddNode("webserver", webNet,
		vmtest.OS(vmtest.Gokrazy),
		vmtest.DontJoinTailnet(),
		vmtest.WebServer(8080),
	)

	// Test-specific steps shown in the web UI, declared in execution order.
	wgUpStep := env.AddStep("Bring up Mullvad WG server")
	injectStep := env.AddStep("Inject Mullvad netmap peer")
	checkOff1Step := env.AddStep("HTTP GET (exit off)")
	checkMullvadStep := env.AddStep("HTTP GET (exit=mullvad)")
	checkOff2Step := env.AddStep("HTTP GET (exit off, again)")

	env.Start()

	// Step 1: start the WG server inside mullvad's TTA with the client's
	// Tailscale node public key as its sole allowed peer.
	wgUpStep.Begin()
	clientStatus := env.Status(client)
	clientKey := clientStatus.Self.PublicKey
	mullvadPub := env.BringUpMullvadWGServer(mullvad, gw, wgListenPort, clientKey, clientMasq, mullvadWGNet)
	wgUpStep.End(nil)

	// Step 2: describe the mullvad VM to the client as a plain-WireGuard
	// exit node, mirroring how the control plane describes Mullvad exit
	// nodes (see control/cmullvad in the closed repo): IsWireGuardOnly=true,
	// an Endpoints entry for the public WG host:port, and AllowedIPs that
	// cover the gateway /32 plus the 0.0.0.0/0 and ::/0 exit-node routes.
	injectStep.Begin()
	gwHost := netip.PrefixFrom(gwIP, gwIP.BitLen())
	peer := &tailcfg.Node{
		ID:                999_001,
		StableID:          "mullvad-test",
		Name:              "mullvad-test.fake-control.example.net.",
		Key:               mullvadPub,
		MachineAuthorized: true,
		IsWireGuardOnly:   true,
		Endpoints: []netip.AddrPort{
			netip.AddrPortFrom(netip.MustParseAddr(mullvadWAN), wgListenPort),
		},
		Addresses: []netip.Prefix{gwHost},
		AllowedIPs: []netip.Prefix{
			gwHost,
			netip.MustParsePrefix("0.0.0.0/0"),
			netip.MustParsePrefix("::/0"),
		},
		Hostinfo: (&tailcfg.Hostinfo{
			Hostname: "mullvad-test",
		}).View(),
	}
	control := env.ControlServer()
	control.UpdateNode(peer)
	// Per-peer source-IP masquerade. The control plane normally derives
	// this from the Mullvad API's per-client registration; the test pins it
	// to the address mullvad's wg0 was told to accept.
	control.SetMasqueradeAddresses([]testcontrol.MasqueradePair{{
		Node:              clientKey,
		Peer:              mullvadPub,
		NodeMasqueradesAs: clientMasq.Addr(),
	}})
	injectStep.End(nil)

	const webURL = "http://" + webWAN + ":8080/"

	// check fetches webURL from the client inside the given step and fails
	// the test unless the body is the webserver's greeting and reports
	// wantSrc as the source address it saw.
	check := func(step *vmtest.Step, label, wantSrc string) {
		t.Helper()
		step.Begin()
		body := env.HTTPGet(client, webURL)
		t.Logf("[%s] response: %s", label, body)

		var err error
		switch {
		case !strings.Contains(body, "Hello world I am webserver"):
			err = fmt.Errorf("[%s] unexpected webserver response: %q", label, body)
		case !strings.Contains(body, "from "+wantSrc):
			err = fmt.Errorf("[%s] expected source %q in response, got %q", label, wantSrc, body)
		}
		step.End(err)
		if err != nil {
			t.Fatal(err)
		}
	}

	// Exit node off: the client egresses directly, so the webserver sees
	// the client's WAN IP.
	check(checkOff1Step, "exit-off", clientWAN)

	// Exit node set to the Mullvad WG-only peer: traffic should go through
	// the WG tunnel and be MASQUERADEd by mullvad, so the webserver sees
	// the mullvad VM's WAN IP.
	env.SetExitNodeIP(client, gwIP)
	check(checkMullvadStep, "exit-mullvad", mullvadWAN)

	// Exit node cleared again, to cover the transition in both directions.
	var noExitNode netip.Addr
	env.SetExitNodeIP(client, noExitNode)
	check(checkOff2Step, "exit-off (again)", clientWAN)
}
// TestCachedNetmapAfterRestart checks that two nodes that both have netmap
// caching enabled (NodeAttrCacheNetworkMaps) can bring their WireGuard
// tunnel back up after both tailscaleds are restarted while the control
// server is unreachable, i.e. using nothing but their on-disk cached
// netmaps.
func TestCachedNetmapAfterRestart(t *testing.T) {
	env := vmtest.New(t)

	aNet := env.AddNetwork("1.0.0.1", "192.168.1.1/24", vnet.EasyNAT)
	bNet := env.AddNetwork("2.0.0.1", "192.168.2.1/24", vnet.EasyNAT)

	a := env.AddNode("a", aNet,
		vmtest.OS(vmtest.Gokrazy),
		tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil},
	)
	b := env.AddNode("b", bNet,
		vmtest.OS(vmtest.Gokrazy),
		tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil},
	)
	nodes := []*vmtest.Node{a, b}

	connectStep := env.AddStep("Establish initial TSMP tunnel")
	cutControlStep := env.AddStep("Cut control server access")
	restartStep := env.AddStep("Restart tailscaled on both nodes")
	netmapCheckStep := env.AddStep("Check netmap loaded is cached")
	pingStep := env.AddStep("Ping a → b TSMP (cached netmap, no control)")

	// tsmpPing runs a TSMP ping from a to b inside step, recording the
	// outcome on the step and aborting the test if the ping fails.
	tsmpPing := func(step *vmtest.Step, timeout time.Duration) {
		t.Helper()
		step.Begin()
		err := env.Ping(a, b, tailcfg.PingTSMP, timeout)
		step.End(err)
		if err != nil {
			t.Fatal(err)
		}
	}

	env.Start()

	tsmpPing(connectStep, 30*time.Second)

	// Cut both nodes off from control. From here on, any map request that
	// still reaches the control server makes the callback panic with the
	// offending node key.
	cutControlStep.Begin()
	for _, n := range nodes {
		n.DropControlTraffic()
	}
	env.ControlServer().SetOnMapRequest(func(nk key.NodePublic) {
		panic(fmt.Sprintf("got connection from %v", nk))
	})
	cutControlStep.End(nil)

	restartStep.Begin()
	for _, n := range nodes {
		env.RestartTailscaled(n)
	}
	restartStep.End(nil)

	// After the restart each node must report that its current netmap is
	// the cached one.
	netmapCheckStep.Begin()
	for _, n := range nodes {
		nm, err := local.GetDebugResultJSON[netmap.NetworkMap](t.Context(), n.Agent().Client, "current-netmap")

		var failure error
		switch {
		case err != nil:
			failure = fmt.Errorf("[%s] got err fetching netmap %q", n.Name(), err)
		case !nm.Cached:
			failure = fmt.Errorf("[%s] expected netmap.Cached = true, got: %t", n.Name(), nm.Cached)
		}
		if failure != nil {
			netmapCheckStep.End(failure)
			t.Fatal(failure)
		}
	}
	netmapCheckStep.End(nil)

	// The 90s timeout is generous on purpose. After both nodes restart with
	// stale cached netmap entries, a's first WG handshake to b's
	// pre-restart endpoint hits the dead NAT mapping on b's side and is
	// silently dropped (visible as "no recent outgoing packet" NAT drops in
	// the vnet log). Recovery then waits on wireguard-go's REKEY_TIMEOUT
	// (~5s) before the next handshake attempt, and on disco-via-DERP to
	// teach each side the other's new endpoint. On an idle host this
	// converges in well under 15s; on a contended host (a 14/16-CPU-loaded
	// local repro, or any shared CI runner) the same sequence has been
	// observed at 50-60s because every timer fires multiple times under
	// scheduling jitter.
	tsmpPing(pingStep, 90*time.Second)
}
// TestDirectConnectionWithCachedNetmapOnOneNode verifies that, of two nodes
// with netmap caching enabled (NodeAttrCacheNetworkMaps), one can be
// restarted while it is cut off from the control server and still
// re-establish a direct WireGuard tunnel to the other. Only node a has its
// control traffic dropped and its tailscaled restarted; after the restart a
// must use only its on-disk cached netmap to reach b, first over TSMP and
// then with a disco ping that is expected to take the direct route.
func TestDirectConnectionWithCachedNetmapOnOneNode(t *testing.T) {
	env := vmtest.New(t)

	aNet := env.AddNetwork("1.0.0.1", "192.168.1.1/24", vnet.EasyNAT)
	bNet := env.AddNetwork("2.0.0.1", "192.168.2.1/24", vnet.EasyNAT)

	a := env.AddNode("a", aNet,
		vmtest.OS(vmtest.Gokrazy),
		tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil})
	b := env.AddNode("b", bNet,
		vmtest.OS(vmtest.Gokrazy),
		tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil})

	cutControlStep := env.AddStep("Cut control server access")
	restartStep := env.AddStep("Restart tailscaled on a")
	tsmpPingStep := env.AddStep("Ping a → b TSMP (cached netmap, no control)")
	discoPingStep := env.AddStep("Ping a → b Disco (want Direct)")

	env.Start()

	// Cut only a off from control. The map-request callback panics if the
	// control server's node record for the requesting key carries a's name;
	// requests from other nodes are ignored.
	cutControlStep.Begin()
	a.DropControlTraffic()
	env.ControlServer().SetOnMapRequest(func(nk key.NodePublic) {
		if env.ControlServer().Node(nk).Name == a.Name() {
			panic(fmt.Sprintf("got connection from %v", a.Name()))
		}
	})
	cutControlStep.End(nil)

	restartStep.Begin()
	env.RestartTailscaled(a)
	restartStep.End(nil)

	// a has no control connection at this point, so reaching b at all
	// relies on what a loaded from its cache.
	tsmpPingStep.Begin()
	if err := env.Ping(a, b, tailcfg.PingTSMP, 30*time.Second); err != nil {
		tsmpPingStep.End(err)
		t.Fatal(err)
	}
	tsmpPingStep.End(nil)

	// The path must also be direct, per vmtest.PingRouteDirect.
	discoPingStep.Begin()
	if err := env.PingExpect(a, b, vmtest.PingRouteDirect, 30*time.Second); err != nil {
		discoPingStep.End(err)
		t.Fatal(err)
	}
	discoPingStep.End(nil)
}