// Copyright (c) Tailscale Inc & contributors // SPDX-License-Identifier: BSD-3-Clause package vmtest_test import ( "bytes" "context" "fmt" "net/netip" "runtime" "strings" "testing" "time" "tailscale.com/client/local" "tailscale.com/ipn" "tailscale.com/net/udprelay/status" "tailscale.com/tailcfg" "tailscale.com/tstest" "tailscale.com/tstest/integration/testcontrol" "tailscale.com/tstest/natlab/vmtest" "tailscale.com/tstest/natlab/vnet" "tailscale.com/types/key" "tailscale.com/types/netmap" "tailscale.com/util/set" ) // skipIfNotMacOSArm64 skips the test when the host isn't a macOS arm64 host. // macOS VM tests require Apple Virtualization.framework via tailmac. // AddNode also enforces this when a macOS node is added, but having an // explicit skip at the top of macOS-only tests makes the requirement // obvious to readers. func skipIfNotMacOSArm64(t *testing.T) { t.Helper() if runtime.GOOS != "darwin" || runtime.GOARCH != "arm64" { t.Skipf("macOS VM tests require a macOS arm64 host (got %s/%s)", runtime.GOOS, runtime.GOARCH) } } func TestMacOSAndLinuxCanPing(t *testing.T) { skipIfNotMacOSArm64(t) env := vmtest.New(t) lan := env.AddNetwork("192.168.1.1/24") linux := env.AddNode("linux", lan, vmtest.OS(vmtest.Gokrazy), vmtest.DontJoinTailnet()) macos := env.AddNode("macos", lan, vmtest.OS(vmtest.MacOS), vmtest.DontJoinTailnet()) env.Start() env.LANPing(linux, macos.LanIP(lan)) } func TestTwoMacOSVMsCanPing(t *testing.T) { skipIfNotMacOSArm64(t) env := vmtest.New(t) lan := env.AddNetwork("192.168.1.1/24") mac1 := env.AddNode("mac1", lan, vmtest.OS(vmtest.MacOS), vmtest.DontJoinTailnet()) mac2 := env.AddNode("mac2", lan, vmtest.OS(vmtest.MacOS), vmtest.DontJoinTailnet()) env.Start() // Both macOS VMs have TTA. Ping from mac1 to mac2 and vice versa. env.LANPing(mac1, mac2.LanIP(lan)) env.LANPing(mac2, mac1.LanIP(lan)) } func TestSubnetRouter(t *testing.T) { testSubnetRouterForOS(t, vmtest.Ubuntu2404) } func TestSubnetRouterFreeBSD(t *testing.T) { testSubnetRouterForOS(t, vmtest.FreeBSD150) } func testSubnetRouterForOS(t testing.TB, srOS vmtest.OSImage) { t.Helper() env := vmtest.New(t) clientNet := env.AddNetwork("2.1.1.1", "192.168.1.1/24", "2000:1::1/64", vnet.EasyNAT) internalNet := env.AddNetwork("10.0.0.1/24", "2000:2::1/64") client := env.AddNode("client", clientNet, vmtest.OS(vmtest.Gokrazy)) sr := env.AddNode("subnet-router", clientNet, internalNet, vmtest.OS(srOS), vmtest.AdvertiseRoutes("10.0.0.0/24")) backend := env.AddNode("backend", internalNet, vmtest.OS(vmtest.Gokrazy), vmtest.DontJoinTailnet(), vmtest.WebServer(8080)) // Declare test-specific steps for the web UI. approveStep := env.AddStep("Approve subnet routes") httpStep := env.AddStep("HTTP GET through subnet router") env.Start() approveStep.Begin() env.ApproveRoutes(sr, "10.0.0.0/24") approveStep.End(nil) httpStep.Begin() body := env.HTTPGet(client, fmt.Sprintf("http://%s:8080/", backend.LanIP(internalNet))) if !strings.Contains(body, "Hello world I am backend") { httpStep.Fatalf("got %q", body) } httpStep.End(nil) } func TestSiteToSite(t *testing.T) { testSiteToSite(t, vmtest.Ubuntu2404) } // testSiteToSite runs a site-to-site subnet routing test with // --snat-subnet-routes=false, verifying that original source IPs are preserved // across Tailscale subnet routes. // // Topology: // // Site A: backend-a (10.1.0.0/24) ← → sr-a (WAN + LAN-A) // Site B: backend-b (10.2.0.0/24) ← → sr-b (WAN + LAN-B) // // Both subnet routers are on Tailscale with --snat-subnet-routes=false. // The test sends HTTP from backend-a to backend-b through the subnet routers // and verifies that backend-b sees backend-a's LAN IP (not the subnet router's). func testSiteToSite(t *testing.T, srOS vmtest.OSImage) { env := vmtest.New(t) // WAN networks for each site (each behind NAT). wanA := env.AddNetwork("2.1.1.1", "192.168.1.1/24", vnet.EasyNAT) wanB := env.AddNetwork("3.1.1.1", "192.168.2.1/24", vnet.EasyNAT) // Internal LAN for each site. lanA := env.AddNetwork("10.1.0.1/24") lanB := env.AddNetwork("10.2.0.1/24") // Subnet routers: each on its WAN + LAN, advertising the local LAN, // with SNAT disabled to preserve source IPs. srA := env.AddNode("sr-a", wanA, lanA, vmtest.OS(srOS), vmtest.AdvertiseRoutes("10.1.0.0/24"), vmtest.SNATSubnetRoutes(false)) srB := env.AddNode("sr-b", wanB, lanB, vmtest.OS(srOS), vmtest.AdvertiseRoutes("10.2.0.0/24"), vmtest.SNATSubnetRoutes(false)) // Backend servers on each site's LAN (not on Tailscale). // Use Ubuntu so we can SSH in to add static routes. backendA := env.AddNode("backend-a", lanA, vmtest.OS(vmtest.Ubuntu2404), vmtest.DontJoinTailnet(), vmtest.WebServer(8080)) backendB := env.AddNode("backend-b", lanB, vmtest.OS(vmtest.Ubuntu2404), vmtest.DontJoinTailnet(), vmtest.WebServer(8080)) // Declare test-specific steps for the web UI. approveStep := env.AddStep("Approve subnet routes (sr-a, sr-b)") staticRouteStep := env.AddStep("Add static routes on backends") httpStep := env.AddStep("HTTP GET through site-to-site") env.Start() approveStep.Begin() env.ApproveRoutes(srA, "10.1.0.0/24") env.ApproveRoutes(srB, "10.2.0.0/24") approveStep.End(nil) // Add static routes on the backends so that traffic to the remote site's // subnet goes through the local subnet router. This mirrors how a real // site-to-site deployment is configured. srALanIP := srA.LanIP(lanA).String() srBLanIP := srB.LanIP(lanB).String() t.Logf("sr-a LAN IP: %s, sr-b LAN IP: %s", srALanIP, srBLanIP) t.Logf("backend-a LAN IP: %s, backend-b LAN IP: %s", backendA.LanIP(lanA), backendB.LanIP(lanB)) staticRouteStep.Begin() env.AddRoute(backendA, "10.2.0.0/24", srALanIP) env.AddRoute(backendB, "10.1.0.0/24", srBLanIP) staticRouteStep.End(nil) // Make an HTTP request from backend-a to backend-b through the subnet routers. // TTA's /http-get falls back to direct dial on non-Tailscale nodes. httpStep.Begin() backendBIP := backendB.LanIP(lanB) body := env.HTTPGet(backendA, fmt.Sprintf("http://%s:8080/", backendBIP)) t.Logf("response: %s", body) if !strings.Contains(body, "Hello world I am backend-b") { httpStep.Fatalf("expected response from backend-b, got %q", body) } // Verify the source IP was preserved. With --snat-subnet-routes=false, // backend-b should see backend-a's LAN IP as the source, not sr-b's LAN IP. backendAIP := backendA.LanIP(lanA).String() if !strings.Contains(body, "from "+backendAIP) { httpStep.Fatalf("source IP not preserved: expected %q in response, got %q", backendAIP, body) } httpStep.End(nil) } // TestInterNetworkTCP verifies that vnet routes raw TCP between simulated // networks: a non-Tailscale VM on one NAT'd LAN can reach a webserver on a // different network using a 1:1 NAT, and the webserver sees the client's // network's WAN IP as the source (post-NAT). func TestInterNetworkTCP(t *testing.T) { env := vmtest.New(t) const ( clientWAN = "1.0.0.1" webWAN = "5.0.0.1" ) clientNet := env.AddNetwork(clientWAN, "192.168.1.1/24", vnet.EasyNAT) webNet := env.AddNetwork(webWAN, "192.168.5.1/24", vnet.One2OneNAT) client := env.AddNode("client", clientNet, vmtest.OS(vmtest.Gokrazy), vmtest.DontJoinTailnet()) env.AddNode("webserver", webNet, vmtest.OS(vmtest.Gokrazy), vmtest.DontJoinTailnet(), vmtest.WebServer(8080)) // Declare test-specific steps for the web UI. httpStep := env.AddStep("HTTP GET across networks via NAT") env.Start() httpStep.Begin() body := env.HTTPGet(client, fmt.Sprintf("http://%s:8080/", webWAN)) t.Logf("response: %s", body) if !strings.Contains(body, "Hello world I am webserver") { httpStep.Fatalf("unexpected response: %q", body) } if !strings.Contains(body, "from "+clientWAN) { httpStep.Fatalf("expected source %q in response, got %q", clientWAN, body) } httpStep.End(nil) } // TestSubnetRouterPublicIP verifies that toggling --accept-routes on the // client switches between dialing a webserver directly and routing through a // subnet router that advertises the webserver's public IP range. // // Topology: client, subnet router, and webserver each live behind their own // NAT'd network with distinct WAN IPs; the subnet router advertises the // webserver's network as a route. The webserver echoes the source IP it // sees: // - accept-routes=off: client dials webserver directly; source is client's WAN. // - accept-routes=on: client tunnels to the subnet router, which forwards // and SNATs; source is subnet router's WAN. func TestSubnetRouterPublicIP(t *testing.T) { env := vmtest.New(t) const ( clientWAN = "1.0.0.1" routerWAN = "2.0.0.1" webWAN = "5.0.0.1" webRoute = "5.0.0.0/24" ) clientNet := env.AddNetwork(clientWAN, "192.168.1.1/24", vnet.EasyNAT) routerNet := env.AddNetwork(routerWAN, "192.168.2.1/24", vnet.EasyNAT) webNet := env.AddNetwork(webWAN, "192.168.5.1/24", vnet.One2OneNAT) client := env.AddNode("client", clientNet, vmtest.OS(vmtest.Gokrazy)) sr := env.AddNode("subnet-router", routerNet, vmtest.OS(vmtest.Gokrazy), vmtest.AdvertiseRoutes(webRoute)) env.AddNode("webserver", webNet, vmtest.OS(vmtest.Gokrazy), vmtest.DontJoinTailnet(), vmtest.WebServer(8080)) // Declare test-specific steps for the web UI. approveStep := env.AddStep("Approve subnet route (public IP)") checkOn1Step := env.AddStep("HTTP GET (accept-routes=on)") checkOffStep := env.AddStep("HTTP GET (accept-routes=off)") checkOn2Step := env.AddStep("HTTP GET (accept-routes=on, again)") env.Start() // ApproveRoutes also turns on RouteAll on the client. approveStep.Begin() env.ApproveRoutes(sr, webRoute) approveStep.End(nil) webURL := fmt.Sprintf("http://%s:8080/", webWAN) check := func(step *vmtest.Step, label, wantSrc string) { t.Helper() step.Begin() body := env.HTTPGet(client, webURL) t.Logf("[%s] response: %s", label, body) if !strings.Contains(body, "Hello world I am webserver") { step.Fatalf("[%s] unexpected webserver response: %q", label, body) } if !strings.Contains(body, "from "+wantSrc) { step.Fatalf("[%s] expected source %q in response, got %q", label, wantSrc, body) } step.End(nil) } // accept-routes=on (set by ApproveRoutes): traffic flows via the subnet router. check(checkOn1Step, "accept-routes=on", routerWAN) // accept-routes=off: client dials the webserver directly. env.SetAcceptRoutes(client, false) check(checkOffStep, "accept-routes=off", clientWAN) // Toggle back on to confirm the transition works in both directions. env.SetAcceptRoutes(client, true) check(checkOn2Step, "accept-routes=on (again)", routerWAN) } // TestSubnetRouterAndExitNode checks how the subnet router and exit node // preferences interact. Topology: client, subnet router, exit node, and // webserver, each on its own NAT'd network with distinct WAN IPs. The subnet // router advertises the webserver's network (5.0.0.0/24); the exit node // advertises 0.0.0.0/0 + ::/0. The webserver echoes the source IP it sees: // // exit=off, subnet=off → client's WAN (direct dial) // exit=off, subnet=on → subnet router's WAN // exit=on, subnet=off → exit node's WAN // exit=on, subnet=on → subnet router's WAN (more-specific /24 beats /0) func TestSubnetRouterAndExitNode(t *testing.T) { env := vmtest.New(t) const ( clientWAN = "1.0.0.1" routerWAN = "2.0.0.1" exitWAN = "3.0.0.1" webWAN = "5.0.0.1" webRoute = "5.0.0.0/24" ) clientNet := env.AddNetwork(clientWAN, "192.168.1.1/24", vnet.EasyNAT) routerNet := env.AddNetwork(routerWAN, "192.168.2.1/24", vnet.EasyNAT) exitNet := env.AddNetwork(exitWAN, "192.168.3.1/24", vnet.EasyNAT) webNet := env.AddNetwork(webWAN, "192.168.5.1/24", vnet.One2OneNAT) client := env.AddNode("client", clientNet, vmtest.OS(vmtest.Gokrazy)) sr := env.AddNode("subnet-router", routerNet, vmtest.OS(vmtest.Gokrazy), vmtest.AdvertiseRoutes(webRoute)) exit := env.AddNode("exit", exitNet, vmtest.OS(vmtest.Gokrazy), vmtest.AdvertiseRoutes("0.0.0.0/0,::/0")) env.AddNode("webserver", webNet, vmtest.OS(vmtest.Gokrazy), vmtest.DontJoinTailnet(), vmtest.WebServer(8080)) // Declare test-specific steps for the web UI. approveStep := env.AddStep("Approve subnet & exit routes") webURL := fmt.Sprintf("http://%s:8080/", webWAN) tests := []struct { name string // subtest name; describes (exit, subnet) toggles exit *vmtest.Node subnet bool wantSrc string step *vmtest.Step }{ {"exit-off,subnet-off", nil, false, clientWAN, nil}, {"exit-off,subnet-on", nil, true, routerWAN, nil}, {"exit-on,subnet-off", exit, false, exitWAN, nil}, // More-specific 5.0.0.0/24 from sr beats 0.0.0.0/0 from exit. {"exit-on,subnet-on", exit, true, routerWAN, nil}, } for i := range tests { tests[i].step = env.AddStep("HTTP GET: " + tests[i].name) } env.Start() approveStep.Begin() env.ApproveRoutes(sr, webRoute) env.ApproveRoutes(exit, "0.0.0.0/0", "::/0") // Don't let the exit node itself forward via the subnet router: when the // client is using the exit node only, we want the exit node to egress to // the simulated internet directly so the webserver sees the exit's WAN. env.SetAcceptRoutes(exit, false) approveStep.End(nil) for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { tc.step.Begin() env.SetExitNode(client, tc.exit) env.SetAcceptRoutes(client, tc.subnet) body := env.HTTPGet(client, webURL) t.Logf("response: %s", body) if !strings.Contains(body, "Hello world I am webserver") { tc.step.End(fmt.Errorf("unexpected webserver response: %q", body)) t.Fatalf("unexpected webserver response: %q", body) } if !strings.Contains(body, "from "+tc.wantSrc) { tc.step.End(fmt.Errorf("expected source %q in response, got %q", tc.wantSrc, body)) t.Fatalf("expected source %q in response, got %q", tc.wantSrc, body) } tc.step.End(nil) }) } } // TestTaildrop verifies that one Ubuntu node can send a file to another // Ubuntu node via Taildrop, and the receiver gets the same content. // // Topology: two Ubuntu nodes, each behind its own EasyNAT, both joined to the // tailnet. The sender runs `tailscale file cp` to push to the receiver's // Tailscale IP; the receiver then runs `tailscale file get --wait` to fetch // it. func TestTaildrop(t *testing.T) { env := vmtest.New(t, vmtest.SameTailnetUser()) senderNet := env.AddNetwork("1.0.0.1", "192.168.1.1/24", vnet.EasyNAT) receiverNet := env.AddNetwork("2.0.0.1", "192.168.2.1/24", vnet.EasyNAT) sender := env.AddNode("sender", senderNet, vmtest.OS(vmtest.Ubuntu2404)) receiver := env.AddNode("receiver", receiverNet, vmtest.OS(vmtest.Ubuntu2404)) // Declare test-specific steps for the web UI. sendStep := env.AddStep("Taildrop send (sender -> receiver)") recvStep := env.AddStep("Taildrop receive (on receiver)") verifyStep := env.AddStep("Verify received name and contents") env.Start() const filename = "hello.txt" want := []byte("hello world this is a Taildrop test\n") sendStep.Begin() env.SendTaildropFile(sender, receiver, filename, want) sendStep.End(nil) recvStep.Begin() gotName, gotContent := env.RecvTaildropFile(t.Context(), receiver) recvStep.End(nil) verifyStep.Begin() if gotName != filename { verifyStep.Fatalf("received name = %q; want %q", gotName, filename) return } if !bytes.Equal(gotContent, want) { verifyStep.Fatalf("received content = %q; want %q", gotContent, want) return } verifyStep.End(nil) } // TestExitNode verifies that switching the client's exit node setting between // off, exit1, and exit2 correctly routes the client's internet traffic. // // Topology: each of the client and the two exit nodes lives behind its own NAT // with a unique WAN IP, and a webserver lives on yet another network using a // 1:1 NAT so it's reachable from the simulated internet at a stable address. // The webserver echoes the source IP of incoming requests, so we can tell // which network's NAT the client's traffic egressed through: // - off: source is the client's network WAN IP. // - exit1: source is exit1's network WAN IP. // - exit2: source is exit2's network WAN IP. func TestExitNode(t *testing.T) { env := vmtest.New(t) const ( clientWAN = "1.0.0.1" exit1WAN = "2.0.0.1" exit2WAN = "3.0.0.1" webWAN = "5.0.0.1" ) clientNet := env.AddNetwork(clientWAN, "192.168.1.1/24", vnet.EasyNAT) exit1Net := env.AddNetwork(exit1WAN, "192.168.2.1/24", vnet.EasyNAT) exit2Net := env.AddNetwork(exit2WAN, "192.168.3.1/24", vnet.EasyNAT) webNet := env.AddNetwork(webWAN, "192.168.5.1/24", vnet.One2OneNAT) client := env.AddNode("client", clientNet, vmtest.OS(vmtest.Gokrazy)) exit1 := env.AddNode("exit1", exit1Net, vmtest.OS(vmtest.Gokrazy), vmtest.AdvertiseRoutes("0.0.0.0/0,::/0")) exit2 := env.AddNode("exit2", exit2Net, vmtest.OS(vmtest.Gokrazy), vmtest.AdvertiseRoutes("0.0.0.0/0,::/0")) env.AddNode("webserver", webNet, vmtest.OS(vmtest.Gokrazy), vmtest.DontJoinTailnet(), vmtest.WebServer(8080)) // Declare test-specific steps for the web UI. approveStep := env.AddStep("Approve exit-node routes (exit1, exit2)") webURL := fmt.Sprintf("http://%s:8080/", webWAN) tests := []struct { name string // subtest name exit *vmtest.Node wantSrc string step *vmtest.Step }{ {"off", nil, clientWAN, nil}, {"exit1", exit1, exit1WAN, nil}, {"exit2", exit2, exit2WAN, nil}, } for i := range tests { tests[i].step = env.AddStep("HTTP GET: exit=" + tests[i].name) } env.Start() approveStep.Begin() env.ApproveRoutes(exit1, "0.0.0.0/0", "::/0") env.ApproveRoutes(exit2, "0.0.0.0/0", "::/0") approveStep.End(nil) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { tt.step.Begin() env.SetExitNode(client, tt.exit) body := env.HTTPGet(client, webURL) t.Logf("response: %s", body) if !strings.Contains(body, "Hello world I am webserver") { tt.step.Fatalf("unexpected webserver response: %q", body) } if !strings.Contains(body, "from "+tt.wantSrc) { tt.step.Fatalf("expected source %q in response, got %q", tt.wantSrc, body) } tt.step.End(nil) }) } } // TestDiscoKeyChange verifies that when one node's disco key rotates without // its WireGuard node key changing, peers detect the change, tear down stale // WireGuard session state for that peer, and re-establish the tunnel in both // directions. This exercises the disco-key-change handling that the // bradfitz/rm_lazy_wg branch relies on for traffic to and from a peer whose // magicsock state has been reset. // // Topology: two gokrazy nodes A and B, each on its own One2OneNAT network so // every connection between them is a direct UDP path with no port-mapping or // filtering. With NAT effects out of the way, what we measure here is the // speed of disco-key-change reconciliation in wgengine/magicsock alone. The // test control server is also configured with [testcontrol.Server.AllOnline] // (via [vmtest.AllOnline]) so the controlclient/wgengine fast paths that // branch on Online actually fire — without that flag the test exercises // only the offline-peer code paths, which mask separate latent issues and // are several seconds slower. // // The test runs four B-side rotations followed by a TSMP ping in the // requested direction: // // rotate (LocalAPI rotate-disco-key) → ping B → A // rotate (LocalAPI rotate-disco-key) → ping A → B // restart (SIGKILL tailscaled) → ping B → A // restart (SIGKILL tailscaled) → ping A → B // // Plus an initial A→B TSMP ping with a generous 30s budget to bring up the // WireGuard tunnel before the rotations begin (so the post-rotation pings // measure stale-state recovery, not first-time setup). All pings are TSMP // because TSMP traverses the actual WireGuard data plane; PingDisco only // exercises the magicsock disco layer and would mask any stale WG session // problems. // // Two rotation methods are exercised: // // - LocalAPI rotate-disco-key (debug action): rolls B's magicsock disco // private key in place, then bounces WantRunning to force wgengine to // drop wireguard-go session keys for every peer (RotateDiscoKey alone // only touches local disco state; without the WantRunning bounce, B // keeps using stale per-peer session keys against A and A drops // everything until B's WG rekey timer eventually fires). // - SIGKILL of tailscaled (via TTA's /kill-tailscaled): the gokrazy // supervisor respawns tailscaled, fully resetting B's magicsock and // wgengine state in addition to rotating the disco key. // // Each post-rotation ping currently gets a 15-second budget. On a // hypothetical perfect build it should take well under a second. In // practice today there are two unavoidable multi-second waits: // // - The rotate-then-a→b phase on main takes ~10s for LazyWG. After // B's WantRunning bounce, B's wgengine resets its sentActivityAt/ // recvActivityAt maps and trims A out of the wireguard-go config // as an "idle peer"; B only re-adds A on inbound activity, by // which point A's first few TSMP packets have been silently // dropped at B's tundev. The bradfitz/rm_lazy_wg branch removes // that trimming entirely (verified locally), so this phase will // drop to <100ms once that branch lands. // // - The restart phases take ~5s for the wireguard-go handshake retry // timer. After SIGKILL+respawn the first WG handshake init from // the restarted node sometimes goes into the void (likely the // brief peer-removed window in the receiver's two-step // [wgengine.userspaceEngine.maybeReconfigWireguardLocked] reconfig // during which the peer is absent from wireguard-go), and wg-go's // [device.RekeyTimeout] of 5s + jitter is the next opportunity to // retry. That retry succeeds and the staged TSMP packet flushes. // This is intrinsic to the protocol's retransmit policy. // // Once LazyWG is removed and the first-handshake-after-reconfig race // is fixed, this budget should be tightened to 5s (or less). // // All four rotations also assert that B's WireGuard node key is unchanged. func TestDiscoKeyChange(t *testing.T) { // AllOnline makes the test control server mark every peer as Online=true // in its MapResponses. Several disco-key handling fast paths // (controlclient.removeUnwantedDiscoUpdates, // removeUnwantedDiscoUpdatesFromFullNetmapUpdate, and the wgengine // tsmpLearnedDisco fast path) only fire for online peers. Production // control servers always populate Online; without this flag the test // would only exercise the offline-peer paths. env := vmtest.New(t, vmtest.AllOnline()) // One2OneNAT so each node has a 1:1 mapping to a public WAN IP with no // port-translation or address-port filtering. This makes A↔B traffic // behave like two unfirewalled hosts on the public internet, so any // slowness we observe in this test cannot be blamed on NAT traversal. aNet := env.AddNetwork("1.0.0.1", "192.168.1.1/24", vnet.One2OneNAT) bNet := env.AddNetwork("2.0.0.1", "192.168.2.1/24", vnet.One2OneNAT) a := env.AddNode("a", aNet, vmtest.OS(vmtest.Gokrazy)) b := env.AddNode("b", bNet, vmtest.OS(vmtest.Gokrazy)) type phase struct { name string rotate func() pingFrom *vmtest.Node pingTo *vmtest.Node applyStep *vmtest.Step verify *vmtest.Step wait *vmtest.Step ping *vmtest.Step } phases := []*phase{ {name: "rotate (LocalAPI), b → a", pingFrom: b, pingTo: a, rotate: func() { env.RotateDiscoKey(b) }}, {name: "rotate (LocalAPI), a → b", pingFrom: a, pingTo: b, rotate: func() { env.RotateDiscoKey(b) }}, {name: "restart, b → a", pingFrom: b, pingTo: a, rotate: func() { env.RestartTailscaled(b) }}, {name: "restart, a → b", pingFrom: a, pingTo: b, rotate: func() { env.RestartTailscaled(b) }}, } pingABStep := env.AddStep("Ping a → b TSMP (establish tunnel)") for _, p := range phases { p.applyStep = env.AddStep("Apply: " + p.name) p.verify = env.AddStep("Verify b: same node key, new disco key (" + p.name + ")") p.wait = env.AddStep("Wait for a to see b's new disco key (" + p.name + ")") p.ping = env.AddStep("Ping " + p.pingFrom.Name() + " → " + p.pingTo.Name() + " TSMP (" + p.name + ")") } env.Start() pingABStep.Begin() if err := env.Ping(a, b, tailcfg.PingTSMP, 30*time.Second); err != nil { pingABStep.Fatal(err) } pingABStep.End(nil) bStInitial := env.Status(b) bNodeKey := bStInitial.Self.PublicKey cs := env.ControlServer() bCtlNode := cs.Node(bNodeKey) if bCtlNode == nil { t.Fatalf("control server has no node for b's key %v", bNodeKey) } prevDisco := bCtlNode.DiscoKey if prevDisco.IsZero() { t.Fatalf("control server has no disco key for b before rotation") } t.Logf("[b] initial: nodekey=%s discokey=%s", bNodeKey.ShortString(), prevDisco.ShortString()) for _, p := range phases { p.applyStep.Begin() p.rotate() p.applyStep.End(nil) prevDisco = checkDiscoRotated(t, env, a, b, p.pingFrom, p.pingTo, bNodeKey, prevDisco, p.name, p.verify, p.wait, p.ping) } } // checkDiscoRotated verifies that after some action that should have rotated // b's disco key, control has learned the new key, b's node key is unchanged, // a's local view picks up the new disco key, and pingFrom can ping pingTo // (TSMP) within the budget. It returns b's new disco key and fatals on // failure. // // The TSMP ping budget is 15 seconds rather than the few hundred ms it // ought to take. See the top-level test docstring for a full breakdown: // it has to absorb LazyWG's trim+re-add for the rotate-a→b phase (~10s) // and wireguard-go's RekeyTimeout retry for the SIGKILL+restart phases // (~5s). Tighten this once both are addressed. func checkDiscoRotated(t *testing.T, env *vmtest.Env, a, b, pingFrom, pingTo *vmtest.Node, bNodeKey key.NodePublic, oldDisco key.DiscoPublic, label string, verifyStep, waitStep, pingStep *vmtest.Step) key.DiscoPublic { t.Helper() cs := env.ControlServer() verifyStep.Begin() bSt := env.Status(b) if got := bSt.Self.PublicKey; got != bNodeKey { verifyStep.Fatalf("[%s] b's node key changed: %v -> %v", label, bNodeKey, got) } var newDisco key.DiscoPublic if err := tstest.WaitFor(15*time.Second, func() error { n := cs.Node(bNodeKey) if n == nil { return fmt.Errorf("control server has no node for b") } if n.DiscoKey.IsZero() || n.DiscoKey == oldDisco { return fmt.Errorf("control still has old disco key %v for b", n.DiscoKey) } newDisco = n.DiscoKey return nil }); err != nil { verifyStep.Fatalf("[%s] %v", label, err) } t.Logf("[b] after %s: nodekey=%s discokey=%s", label, bNodeKey.ShortString(), newDisco.ShortString()) verifyStep.End(nil) waitStep.Begin() if err := tstest.WaitFor(30*time.Second, func() error { d, ok, err := env.PeerDiscoKey(a, bNodeKey) if err != nil { return err } if !ok { return fmt.Errorf("a doesn't yet have b in its status") } if d != newDisco { return fmt.Errorf("a still sees b's old disco %v, want %v", d.ShortString(), newDisco.ShortString()) } return nil }); err != nil { waitStep.End(err) env.DumpStatus(a) t.Fatalf("[%s] %v", label, err) } waitStep.End(nil) pingStep.Begin() t0 := time.Now() if err := env.Ping(pingFrom, pingTo, tailcfg.PingTSMP, 15*time.Second); err != nil { pingStep.End(err) env.DumpStatus(a) env.DumpStatus(b) t.Fatalf("[%s] %v", label, err) } t.Logf("[%s] ping %s -> %s succeeded in %v", label, pingFrom.Name(), pingTo.Name(), time.Since(t0).Round(100*time.Millisecond)) pingStep.End(nil) return newDisco } // TestMullvadExitNode verifies that a Tailscale client whose netmap contains // a plain-WireGuard exit node (the way Mullvad exit nodes are wired up by // the control plane) can route internet traffic through it, with the source // IP rewritten to the per-client Mullvad-assigned address. // // Topology: // // client (Tailscale, gokrazy) — clientNet (EasyNAT) WAN 1.0.0.1 // mullvad (Ubuntu, userspace WG) — mullvadNet (One2OneNAT) WAN 2.0.0.1 // webserver (no Tailscale, gokrazy) — webNet (One2OneNAT) WAN 5.0.0.1 // // The mullvad VM impersonates a Mullvad WireGuard server. After boot, the // test asks its TTA agent to bring up a userspace WireGuard interface (a // real Linux TUN driven by wireguard-go) that pins the client's Tailscale // node public key as its only allowed peer, sets up IP-forwarding + a // MASQUERADE rule, and reports the WG server's freshly generated public // key back. Userspace vs kernel WireGuard makes no difference on the wire // — what's being tested is Tailscale's plain-WireGuard exit-node code // path, not the kernel module. // // The test then injects a netmap peer with IsWireGuardOnly=true, // AllowedIPs=[gw/32, 0.0.0.0/0, ::/0], the WG endpoint, and a per-client // SelfNodeV4MasqAddrForThisPeer (the mock equivalent of the per-client IP // Mullvad's API hands out at registration time). // // The webserver echoes the source IP it sees: // - exit-node off: source is client's WAN (direct egress) // - exit-node on: source is mullvad's WAN (egress via WG + MASQUERADE) func TestMullvadExitNode(t *testing.T) { env := vmtest.New(t) const ( clientWAN = "1.0.0.1" mullvadWAN = "2.0.0.1" webWAN = "5.0.0.1" ) // Mullvad-side WG network. The client appears as clientMasqIP to // mullvad's wg0; mullvad terminates the tunnel at gw. var ( mullvadWGNet = netip.MustParsePrefix("10.64.0.0/24") gw = netip.MustParsePrefix("10.64.0.1/24") clientMasq = netip.MustParsePrefix("10.64.0.2/32") ) const wgListenPort uint16 = 51820 clientNet := env.AddNetwork(clientWAN, "192.168.1.1/24", vnet.EasyNAT) mullvadNet := env.AddNetwork(mullvadWAN, "192.168.2.1/24", vnet.One2OneNAT) webNet := env.AddNetwork(webWAN, "192.168.5.1/24", vnet.One2OneNAT) client := env.AddNode("client", clientNet, vmtest.OS(vmtest.Gokrazy)) mullvad := env.AddNode("mullvad", mullvadNet, vmtest.OS(vmtest.Ubuntu2404), vmtest.DontJoinTailnet()) env.AddNode("webserver", webNet, vmtest.OS(vmtest.Gokrazy), vmtest.DontJoinTailnet(), vmtest.WebServer(8080)) // Declare test-specific steps for the web UI. wgUpStep := env.AddStep("Bring up Mullvad WG server") injectStep := env.AddStep("Inject Mullvad netmap peer") checkOff1Step := env.AddStep("HTTP GET (exit off)") checkMullvadStep := env.AddStep("HTTP GET (exit=mullvad)") checkOff2Step := env.AddStep("HTTP GET (exit off, again)") env.Start() // Bring up the WG server inside mullvad's TTA, pinning the client's // Tailscale node public key as the sole allowed peer. wgUpStep.Begin() clientStatus := env.Status(client) mullvadPub := env.BringUpMullvadWGServer(mullvad, gw, wgListenPort, clientStatus.Self.PublicKey, clientMasq, mullvadWGNet) wgUpStep.End(nil) // Inject the mullvad node into the netmap as a plain-WireGuard exit // node. This mirrors how the control plane describes Mullvad exit // nodes to clients (see control/cmullvad in the closed repo): a // peer with IsWireGuardOnly=true, an Endpoints entry pointing at // the public WG host:port, and AllowedIPs covering both the gateway // /32 and the 0.0.0.0/0+::/0 exit-node routes. injectStep.Begin() mullvadEndpoint := netip.AddrPortFrom(netip.MustParseAddr(mullvadWAN), wgListenPort) gwHost := netip.PrefixFrom(gw.Addr(), gw.Addr().BitLen()) mullvadNode := &tailcfg.Node{ ID: 999_001, StableID: "mullvad-test", Name: "mullvad-test.fake-control.example.net.", Key: mullvadPub, MachineAuthorized: true, IsWireGuardOnly: true, Endpoints: []netip.AddrPort{mullvadEndpoint}, Addresses: []netip.Prefix{gwHost}, AllowedIPs: []netip.Prefix{ gwHost, netip.MustParsePrefix("0.0.0.0/0"), netip.MustParsePrefix("::/0"), }, Hostinfo: (&tailcfg.Hostinfo{ Hostname: "mullvad-test", }).View(), } cs := env.ControlServer() cs.UpdateNode(mullvadNode) // Set the per-peer source-IP masquerade. The control plane normally // derives this from the Mullvad API's per-client registration; here // we just pin it to the address mullvad's wg0 was told to accept. cs.SetMasqueradeAddresses([]testcontrol.MasqueradePair{{ Node: clientStatus.Self.PublicKey, Peer: mullvadPub, NodeMasqueradesAs: clientMasq.Addr(), }}) injectStep.End(nil) webURL := fmt.Sprintf("http://%s:8080/", webWAN) check := func(step *vmtest.Step, label, wantSrc string) { t.Helper() step.Begin() body := env.HTTPGet(client, webURL) t.Logf("[%s] response: %s", label, body) if !strings.Contains(body, "Hello world I am webserver") { step.Fatalf("[%s] unexpected webserver response: %q", label, body) } if !strings.Contains(body, "from "+wantSrc) { step.Fatalf("[%s] expected source %q in response, got %q", label, wantSrc, body) } step.End(nil) } // Exit-node off: client routes 0.0.0.0/0 directly via its host stack, // so the webserver sees client's WAN IP. check(checkOff1Step, "exit-off", clientWAN) // Switch to the Mullvad WG-only peer as exit node. The client should // now route 0.0.0.0/0 through the WG tunnel; mullvad MASQUERADEs to // its WAN; the webserver sees the mullvad VM's WAN IP. env.SetExitNodeIP(client, gw.Addr()) check(checkMullvadStep, "exit-mullvad", mullvadWAN) // And back off again, to make sure the transition works in both // directions. env.SetExitNodeIP(client, netip.Addr{}) check(checkOff2Step, "exit-off (again)", clientWAN) } // checkClientMetrics verifies that each entry in want exists and has the given // value in metrics. func checkClientMetrics(t *testing.T, label string, metrics vmtest.ClientMetrics, want map[string]int64) { t.Helper() for name, wantValue := range want { got, ok := metrics[name] if !ok { t.Errorf("%s: required metric %q not found", label, name) } else if got.Value != wantValue { t.Errorf("%s: metric %q: got %v, want %v", label, name, got.Value, wantValue) } } } // TestCachedNetmapAfterRestart verifies that two nodes with netmap // caching enabled (NodeAttrCacheNetworkMaps) can re-establish a direct // WireGuard tunnel after both are restarted while the control server is // unreachable. After restart the nodes must use only their on-disk cached // netmaps to re-connect. func TestCachedNetmapAfterRestart(t *testing.T) { env := vmtest.New(t) aNet := env.AddNetwork("1.0.0.1", "192.168.1.1/24", vnet.EasyNAT) bNet := env.AddNetwork("2.0.0.1", "192.168.2.1/24", vnet.EasyNAT) a := env.AddNode("a", aNet, vmtest.OS(vmtest.Gokrazy), tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil}) b := env.AddNode("b", bNet, vmtest.OS(vmtest.Gokrazy), tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil}) connectStep := env.AddStep("Establish initial TSMP tunnel") cutControlStep := env.AddStep("Cut control server access") restartStep := env.AddStep("Restart tailscaled on both nodes") netmapCheckStep := env.AddStep("Check netmap loaded is cached") pingStep := env.AddStep("Ping a → b TSMP (cached netmap, no control)") env.Start() connectStep.Begin() if err := env.Ping(a, b, tailcfg.PingTSMP, 30*time.Second); err != nil { connectStep.Fatal(err) } connectStep.End(nil) cutControlStep.Begin() // Both nodes lose connection to control a.DropControlTraffic() b.DropControlTraffic() env.ControlServer().SetOnMapRequest(func(nk key.NodePublic) { panic(fmt.Sprintf("got connection from %v", nk)) }) cutControlStep.End(nil) restartStep.Begin() env.RestartTailscaled(a) env.RestartTailscaled(b) restartStep.End(nil) netmapCheckStep.Begin() for _, node := range []*vmtest.Node{a, b} { nm, err := local.GetDebugResultJSON[netmap.NetworkMap](t.Context(), node.Agent().Client, "current-netmap") if err != nil { netmapCheckStep.Fatalf("[%s] got err fetching netmap %q", node.Name(), err) } if !nm.Cached { netmapCheckStep.Fatalf("[%s] expected netmap.Cached = true, got: %t", node.Name(), nm.Cached) } } netmapCheckStep.End(nil) // 90s is generous on purpose. After both nodes restart with stale cached // netmap entries, a's first WG handshake to b's pre-restart endpoint // hits the dead NAT mapping on b's side and is silently dropped (we // see this as "no recent outgoing packet" NAT drops in the vnet log). // Recovery then waits on wireguard-go's REKEY_TIMEOUT (~5s) before the // next handshake attempt, and on disco-via-DERP to teach each side the // other's new endpoint. On an idle host this converges in well under // 15s; on a contended host (a 14/16-CPU-loaded local repro, or any // shared CI runner) the same sequence has been observed at 50-60s // because every timer fires multiple times under scheduling jitter. pingStep.Begin() if err := env.Ping(a, b, tailcfg.PingTSMP, 90*time.Second); err != nil { pingStep.Fatal(err) } pingStep.End(nil) } // TestDirectConnectionWithCachedNetmap verifies that two nodes with netmap // caching enabled (NodeAttrCacheNetworkMaps) can re-establish a direct // WireGuard tunnel after one is restarted while the control server is // unreachable. After restart the node must use only its on-disk cached // netmaps to re-connect and ping the other (still online) node. func TestDirectConnectionWithCachedNetmapOnOneNode(t *testing.T) { env := vmtest.New(t) aNet := env.AddNetwork("1.0.0.1", "192.168.1.1/24", vnet.EasyNAT) bNet := env.AddNetwork("2.0.0.1", "192.168.2.1/24", vnet.EasyNAT) a := env.AddNode("a", aNet, vmtest.OS(vmtest.Gokrazy), tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil}) b := env.AddNode("b", bNet, vmtest.OS(vmtest.Gokrazy), tailcfg.NodeCapMap{tailcfg.NodeAttrCacheNetworkMaps: nil}) checkInitialMetrics := env.AddStep("Check initial client metrics") cutControlStep := env.AddStep("Cut control server access") restartStep := env.AddStep("Restart tailscaled on a") tsmpPingStep := env.AddStep("Ping a → b TSMP (cached netmap, no control)") discoPingStep := env.AddStep("Ping a → b Disco (want Direct)") checkFinalMetrics := env.AddStep("Check final client metrics") env.Start() // Before: Verify that we have not recorded any cached contacts. checkInitialMetrics.Begin() checkClientMetrics(t, "Node A", env.ClientMetrics(a), map[string]int64{ "magicsock_cached_peer_contact_derp": 0, "magicsock_cached_peer_contact_direct": 0, }) checkInitialMetrics.End(nil) cutControlStep.Begin() a.DropControlTraffic() env.ControlServer().SetOnMapRequest(func(nk key.NodePublic) { if env.ControlServer().Node(nk).Name == a.Name() { panic(fmt.Sprintf("got connection from %v", a.Name())) } }) cutControlStep.End(nil) restartStep.Begin() env.RestartTailscaled(a) restartStep.End(nil) tsmpPingStep.Begin() if err := env.Ping(a, b, tailcfg.PingTSMP, 30*time.Second); err != nil { tsmpPingStep.Fatal(err) } tsmpPingStep.End(nil) discoPingStep.Begin() if err := env.PingExpect(a, b, vmtest.PingRouteDirect, 30*time.Second); err != nil { discoPingStep.Fatal(err) } discoPingStep.End(nil) // After: Verify that we recorded a direct contact on the disconnected node. checkFinalMetrics.Begin() checkClientMetrics(t, "Node A", env.ClientMetrics(a), map[string]int64{ "magicsock_cached_peer_contact_direct": 1, }) checkFinalMetrics.End(nil) } // TestPeerRelay verifies that two Tailscale nodes whose direct UDP path is // impossible at the network layer (both behind HardNAT, with no port-mapping // services on either of their networks) can still communicate via a third // Tailscale node configured as a peer-relay server. // // Topology: // // a (gokrazy, HardNAT) — aNet WAN 1.0.0.1 // b (gokrazy, HardNAT) — bNet WAN 2.0.0.1 // relay (gokrazy, One2OneNAT) — relayNet WAN 3.0.0.1 // // HardNAT in natlab is endpoint-dependent (each (src, dst) tuple gets a fresh // outbound port, and the inbound table keys on (wanPort, src)). Without // NAT-PMP/UPnP a→b and b→a direct UDP paths cannot be established. The relay // uses One2OneNAT so its STUN-discovered WAN endpoint is reachable from both // peers. The test then asserts that magicsock chose the peer-relay path // (not DERP) and that the relay reports the session. func TestPeerRelay(t *testing.T) { env := vmtest.New(t, vmtest.PeerRelayGrants()) aNet := env.AddNetwork("1.0.0.1", "192.168.1.1/24", vnet.HardNAT) bNet := env.AddNetwork("2.0.0.1", "192.168.2.1/24", vnet.HardNAT) relayNet := env.AddNetwork("3.0.0.1", "192.168.3.1/24", vnet.One2OneNAT) a := env.AddNode("a", aNet, vmtest.OS(vmtest.Gokrazy)) b := env.AddNode("b", bNet, vmtest.OS(vmtest.Gokrazy)) relay := env.AddNode("relay", relayNet, vmtest.OS(vmtest.Gokrazy)) enableRelayStep := env.AddStep("Enable peer-relay server on relay") pingStep := env.AddStep("Disco ping a → b (want peer-relay path)") sessionsStep := env.AddStep("Check DebugPeerRelaySessions on relay") env.Start() // Turn on the relay server. Port 0 picks an unused port. enableRelayStep.Begin() editCtx, editCancel := context.WithTimeout(t.Context(), 30*time.Second) _, err := relay.Agent().EditPrefs(editCtx, &ipn.MaskedPrefs{ Prefs: ipn.Prefs{RelayServerPort: new(uint16(0))}, RelayServerPortSet: true, }) editCancel() if err != nil { enableRelayStep.Fatalf("EditPrefs(relay, RelayServerPort=0): %v", err) } enableRelayStep.End(nil) // Wait for the relay to start, peers to learn about it via netmap, // and the a→b disco ping to traverse it. // PingResult.PeerRelay is set by magicsock to "ip:port:vni:N" when the // disco probe rode a peer relay (vs Endpoint for direct UDP or // DERPRegionID for DERP). pingStep.Begin() bIP := env.Status(b).Self.TailscaleIPs[0] var lastDetail string err = tstest.WaitFor(60*time.Second, func() error { ctx, cancel := context.WithTimeout(t.Context(), 5*time.Second) defer cancel() pr, err := a.Agent().PingWithOpts(ctx, bIP, tailcfg.PingDisco, local.PingOpts{}) if err != nil { return fmt.Errorf("ping: %w", err) } if pr.Err != "" { return fmt.Errorf("ping err: %s", pr.Err) } if pr.PeerRelay == "" { lastDetail = fmt.Sprintf("endpoint=%q derp=%d", pr.Endpoint, pr.DERPRegionID) return fmt.Errorf("ping did not use a peer relay; %s", lastDetail) } t.Logf("a → b disco ping rode peer-relay %s", pr.PeerRelay) return nil }) if err != nil { env.DumpStatus(a) env.DumpStatus(b) env.DumpStatus(relay) pingStep.Fatalf("waiting for peer-relay path a → b: %v (last: %s)", err, lastDetail) } pingStep.End(nil) // The relay's local debug-peer-relay-sessions LocalAPI should now // report a single session for the a↔b disco probe. Cross-check the // session's client disco keys against control's view of a and b, and // confirm both sides recorded non-zero packet/byte counts (the disco // ping + pong each take one underlay packet through the relay). sessionsStep.Begin() ctx, cancel := context.WithTimeout(t.Context(), 5*time.Second) defer cancel() srv, err := relay.Agent().DebugPeerRelaySessions(ctx) if err != nil { sessionsStep.Fatalf("DebugPeerRelaySessions: %v", err) } if srv.UDPPort == nil { sessionsStep.Fatalf("relay UDPPort is nil; want set") } if got, want := len(srv.Sessions), 1; got != want { sessionsStep.Fatalf("relay sessions = %d; want %d: %+v", got, want, srv.Sessions) } cs := env.ControlServer() wantShorts := set.Of( cs.Node(env.Status(a).Self.PublicKey).DiscoKey.ShortString(), cs.Node(env.Status(b).Self.PublicKey).DiscoKey.ShortString(), ) session := srv.Sessions[0] gotShorts := set.Of(session.Client1.ShortDisco, session.Client2.ShortDisco) if !gotShorts.Equal(wantShorts) { sessionsStep.Fatalf("session disco shorts = %v; want %v", gotShorts, wantShorts) } for _, ci := range []status.ClientInfo{session.Client1, session.Client2} { if !ci.Endpoint.IsValid() { sessionsStep.Fatalf("session client %s: invalid Endpoint", ci.ShortDisco) } if ci.PacketsTx == 0 { sessionsStep.Fatalf("session client %s: PacketsTx = 0; want >0", ci.ShortDisco) } if ci.BytesTx == 0 { sessionsStep.Fatalf("session client %s: BytesTx = 0; want >0", ci.ShortDisco) } } t.Logf("relay session VNI=%d %s <-> %s on UDP port %d", session.VNI, session.Client1.ShortDisco, session.Client2.ShortDisco, *srv.UDPPort) sessionsStep.End(nil) }