Files
tailscale/tstest/natlab/vmtest/vmtest.go
T
Brad Fitzpatrick 93440604e0 tstest/natlab/vmtest: add TestPeerRelay
Add a VM-based natlab test that exercises the peer-relay feature
(feature/relayserver) end-to-end across three Tailscale nodes whose
network topology makes a direct A<->B UDP path impossible: both peers
are behind HardNAT (FreeBSD/pfSense-style endpoint-dependent NAT) with
no port-mapping services, while the relay node is behind One2OneNAT so
its STUN-discovered WAN endpoint is reachable from both peers. The
test enables the relay server via EditPrefs, then waits for an a->b
PingDisco whose PingResult.PeerRelay is set (proving magicsock chose
the peer-relay path, not DERP), and finally asserts that the relay's
DebugPeerRelaySessions LocalAPI reports the session.

The existing TestPeerRelayPing in tstest/integration runs three
tailscaled processes on the loopback interface with no NATs; this new
vmtest covers peer relay through real per-VM kernels and NATs.

To wire control-server capabilities into vmtest, also add a
PeerRelayGrants() EnvOption (sibling of AllOnline,
SameTailnetUser) that flips testcontrol.Server.PeerRelayGrants so the
wildcard packet filter grants tailcfg.PeerCapabilityRelay and
PeerCapabilityRelayTarget; without those caps magicsock won't consider
any peer a candidate relay.

Updates #13038

Change-Id: Ib3440b83ec442da0d3b89ffa48ceea9398ea9062
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2026-05-14 14:47:29 -07:00

1841 lines
57 KiB
Go

// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
// Package vmtest provides a high-level framework for running integration tests
// across multiple QEMU virtual machines connected by natlab's vnet virtual
// network infrastructure. It supports mixed OS types (gokrazy, Ubuntu, Debian)
// and multi-NIC configurations for scenarios like subnet routing.
//
// Prerequisites:
// - qemu-system-x86_64 (KVM is used automatically on Linux when /dev/kvm is accessible)
// - A built gokrazy natlabapp image (auto-built on first run via "make natlab" in gokrazy/)
//
// Run tests with:
//
// go test ./tstest/natlab/vmtest/ --run-vm-tests -v
package vmtest
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"flag"
"fmt"
"io"
"net"
"net/http"
"net/netip"
"net/url"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"testing"
"time"
"github.com/google/gopacket/layers"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"go4.org/mem"
"golang.org/x/sync/errgroup"
"tailscale.com/client/local"
"tailscale.com/ipn"
"tailscale.com/ipn/ipnstate"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/tstest/integration/testcontrol"
"tailscale.com/tstest/natlab/vnet"
"tailscale.com/types/key"
"tailscale.com/util/mak"
)
// Command-line flags controlling how the VM tests in this package run.
var (
	// runVMTests gates every test built on this package: New skips the
	// test unless it is set.
	runVMTests = flag.Bool("run-vm-tests", false, "run tests that require QEMU VMs")
	// verboseVMDebug enables the extra logging emitted via Env.logVerbosef.
	verboseVMDebug = flag.Bool("verbose-vm-debug", false, "enable verbose debug logging for VM tests")
	// testVersion optionally selects a released Tailscale version to test
	// instead of the source tree. Env.Start resolves it and stores the
	// result in Env.testVersion.
	testVersion = flag.String("test-version", "", `if non-empty, download tailscale & tailscaled at the given release version (e.g. "1.97.255", "unstable", or "stable") instead of building from the source tree`)
)
// Env is a test environment that manages virtual networks and QEMU VMs.
// Create one with New, add networks and nodes, then call Start.
type Env struct {
	t             testing.TB  // test that owns this environment
	cfg           vnet.Config // vnet configuration built up by AddNetwork and AddNode
	server        *vnet.Server
	nodes         []*Node // in AddNode order
	tempDir       string
	sockDir       string // short-path dir for Unix sockets (macOS has 104-byte limit)
	sockAddr      string // shared Unix socket path for all QEMU netdevs
	dgramSockAddr string // Unix dgram socket path for macOS VMs (tailmac)
	binDir        string // directory for compiled binaries
	// testVersion is the resolved Tailscale release version to use (empty if
	// building from source). When non-empty, tailscale and tailscaled binaries
	// are downloaded from pkgs.tailscale.com instead of compiled from the tree.
	testVersion string
	// gokrazy-specific paths
	gokrazyBase   string // path to gokrazy base qcow2 image
	gokrazyKernel string // path to gokrazy kernel
	// tailmac-specific paths (macOS VMs)
	tailmacDir        string // path to tailmac bin/ directory containing Host.app
	macosSnapshot     string // path to cached macOS VM snapshot directory
	macosSnapshotOnce sync.Once
	qemuProcs         []*exec.Cmd // launched QEMU processes
	sameTailnetUser   bool        // all nodes register as the same Tailnet user
	allOnline         bool        // mark every peer as Online=true in MapResponses
	peerRelayGrants   bool        // grant peer-relay capabilities on the wildcard packet filter
	// Shared resource initialization (sync.Once for things multiple nodes share).
	vnetOnce      sync.Once
	gokrazyOnce   sync.Once
	qemuSockOnce  sync.Once
	dgramSockOnce sync.Once
	compileMu     sync.Mutex
	compileOnce   map[string]*sync.Once // keyed by goos_goarch
	imageOnce     map[string]*sync.Once // keyed by OSImage.Name
	// Web UI support.
	ctx        context.Context // cancelled when test ends; set in Start
	eventBus   *EventBus
	testStatus *TestStatus
	// stepsMu is held by Step while it reads and updates stepsByKey and steps.
	stepsMu    sync.Mutex
	stepsByKey map[string]*Step
	steps      []*Step
	// nodeStatusMu guards nodeStatus and the NodeStatus values it points to.
	nodeStatusMu sync.Mutex
	nodeStatus   map[string]*NodeStatus // keyed by node name
}
// logVerbosef is a Logf wrapper that stays silent unless the
// --verbose-vm-debug flag was passed.
func (e *Env) logVerbosef(format string, args ...any) {
	if !*verboseVMDebug {
		return
	}
	e.t.Helper()
	e.t.Logf(format, args...)
}
// vmPlatform defines how a VM type boots. Each OS image type (gokrazy,
// cloud, macOS) implements this interface; Node.platform selects the
// implementation for a given node.
type vmPlatform interface {
	// planSteps registers steps with the web UI in a dry-run pass.
	// Env.Start calls it for every node before any boot call.
	planSteps(e *Env, n *Node)
	// boot does everything needed to get this node running: ensure images,
	// compile binaries, set up sockets, launch VM. Called concurrently:
	// Env.Start runs one boot call per node, each in its own goroutine.
	boot(ctx context.Context, e *Env, n *Node) error
}
// platform picks the vmPlatform implementation matching the node's OS image:
// macOS first, then gokrazy, with the QEMU cloud-image platform as the
// fallback for everything else.
func (n *Node) platform() vmPlatform {
	switch {
	case n.os.IsMacOS:
		return macPlatform{}
	case n.os.IsGokrazy:
		return gokrazyPlatform{}
	default:
		return qemuCloudPlatform{}
	}
}
// AddStep declares an expected stage of the test. The web UI shows all steps
// from the start, tracking their progress. Call before or during the test.
// Returns a *Step whose Begin/End methods drive the progress display.
//
// Unlike Step, AddStep always creates a new step and does not register it
// by key. It holds stepsMu while appending so that it cannot race with
// concurrent Step calls, which mutate the same slice under that lock.
func (e *Env) AddStep(name string) *Step {
	e.stepsMu.Lock()
	defer e.stepsMu.Unlock()
	s := &Step{
		name:  name,
		index: len(e.steps),
		env:   e,
	}
	e.steps = append(e.steps, s)
	return s
}
// Step looks up the step registered under key, lazily creating and
// registering it on first use. It is safe for concurrent use: the dry-run
// pass (planSteps) and the real run (boot) both call it and receive the
// same *Step for the same key.
func (e *Env) Step(key string) *Step {
	e.stepsMu.Lock()
	defer e.stepsMu.Unlock()

	if existing, found := e.stepsByKey[key]; found {
		return existing
	}
	if e.stepsByKey == nil {
		e.stepsByKey = make(map[string]*Step)
	}
	created := &Step{name: key, index: len(e.steps), env: e}
	e.stepsByKey[key] = created
	e.steps = append(e.steps, created)
	return created
}
// Steps returns all declared steps in order.
//
// The returned slice is the Env's own slice, not a copy, and it is read
// without holding stepsMu.
// NOTE(review): callers running concurrently with Step may race — confirm
// how the web UI calls this.
func (e *Env) Steps() []*Step {
	return e.steps
}
// publishStepChange announces on the event bus that step s changed status.
// The event message is the step's status icon followed by its name.
func (e *Env) publishStepChange(s *Step) {
	ev := VMEvent{
		Type: EventStepChanged,
		Step: s,
	}
	ev.Message = fmt.Sprintf("%s %s", s.Status().Icon(), s.name)
	e.eventBus.Publish(ev)
}
// initNodeStatus creates a fresh NodeStatus entry for every node added so
// far, so the web UI has something to render. Each NIC starts with DHCP
// "waiting" and the Tailscale column starts as "--". Start calls this once
// all AddNode calls are done.
func (e *Env) initNodeStatus() {
	e.nodeStatusMu.Lock()
	defer e.nodeStatusMu.Unlock()

	for _, node := range e.nodes {
		nics := make([]NICStatus, 0, len(node.nets))
		for idx := range node.nets {
			nics = append(nics, NICStatus{
				NetName: e.nicLabel(node, idx),
				DHCP:    "waiting",
			})
		}
		status := &NodeStatus{
			Name:         node.name,
			OS:           node.os.Name,
			NICs:         nics,
			JoinsTailnet: node.joinTailnet,
			Tailscale:    "--",
		}
		e.nodeStatus[node.name] = status
	}
}
// nicLabel produces a short display label for the i-th NIC of n: the NIC's
// LAN IP when the vnet node reports a valid one, otherwise the generic
// "NIC <i>" placeholder.
func (e *Env) nicLabel(n *Node, i int) string {
	if vn := n.vnetNode; vn != nil {
		if addr := vn.LanIP(n.nets[i]); addr.IsValid() {
			return addr.String()
		}
	}
	return fmt.Sprintf("NIC %d", i)
}
// getNodeStatus returns a copy of the status recorded for the named node.
// Unknown nodes yield a placeholder carrying just the name and a "--"
// Tailscale state.
func (e *Env) getNodeStatus(name string) NodeStatus {
	e.nodeStatusMu.Lock()
	defer e.nodeStatusMu.Unlock()

	if recorded := e.nodeStatus[name]; recorded != nil {
		return *recorded
	}
	return NodeStatus{Name: name, Tailscale: "--"}
}
// setNodeDHCP updates the DHCP status for a specific NIC on a node.
//
// Unknown node names and out-of-range NIC indices are silently ignored.
// That includes negative indices such as the -1 that nicIndexForMAC returns
// for an unknown MAC, which previously passed the upper-bound check and
// panicked on the slice access.
func (e *Env) setNodeDHCP(name string, nicIdx int, status string) {
	e.nodeStatusMu.Lock()
	defer e.nodeStatusMu.Unlock()
	ns := e.nodeStatus[name]
	if ns == nil || nicIdx < 0 || nicIdx >= len(ns.NICs) {
		return
	}
	ns.NICs[nicIdx].DHCP = status
}
// setNodeTailscale records a new Tailscale status string for the named node
// (if it is known) and then publishes an EventTailscale event so the web UI
// refreshes over its WebSocket. The event is published after the status
// lock has been released, and regardless of whether the node is known.
func (e *Env) setNodeTailscale(name, status string) {
	func() {
		e.nodeStatusMu.Lock()
		defer e.nodeStatusMu.Unlock()
		if ns := e.nodeStatus[name]; ns != nil {
			ns.Tailscale = status
		}
	}()

	ev := VMEvent{
		NodeName: name,
		Type:     EventTailscale,
		Message:  "Tailscale: " + status,
		Detail:   status,
	}
	e.eventBus.Publish(ev)
}
// appendConsoleLine appends line to the named node's console buffer,
// keeping only the most recent maxConsoleLines entries. Lines for unknown
// nodes are dropped.
func (e *Env) appendConsoleLine(name, line string) {
	e.nodeStatusMu.Lock()
	defer e.nodeStatusMu.Unlock()

	ns := e.nodeStatus[name]
	if ns == nil {
		return
	}
	ns.Console = append(ns.Console, line)
	if excess := len(ns.Console) - maxConsoleLines; excess > 0 {
		// Trim from the front so the newest lines survive.
		ns.Console = ns.Console[excess:]
	}
}
// nicIndexForMAC finds which NIC (0-based) of the named node carries the
// given MAC address. It reports -1 when either the node or the MAC is not
// found.
func (e *Env) nicIndexForMAC(name string, mac vnet.MAC) int {
	for _, node := range e.nodes {
		if node.name == name {
			for idx := range node.nets {
				if mac == node.vnetNode.NICMac(idx) {
					return idx
				}
			}
		}
	}
	return -1
}
// nodeNameByNum maps a vnet node number back to the node's name. When no
// node has that number it falls back to the synthetic name "node<num>".
func (e *Env) nodeNameByNum(num int) string {
	for i := range e.nodes {
		if node := e.nodes[i]; node.num == num {
			return node.name
		}
	}
	return fmt.Sprintf("node%d", num)
}
// New creates a new test environment. It skips the test if --run-vm-tests is
// not set. opts may contain [EnvOption] values returned by helpers like
// [SameTailnetUser].
//
// New only allocates directories and in-memory state; networks and nodes are
// added afterwards and nothing boots until [Env.Start]. It fatals the test
// if the socket directory cannot be created.
func New(t testing.TB, opts ...EnvOption) *Env {
	if !*runVMTests {
		t.Skip("skipping VM test; set --run-vm-tests to run")
	}
	tempDir := t.TempDir()
	// Unix sockets have a short path limit (104 bytes on macOS). The Go
	// test TempDir path easily exceeds that, so create a dedicated short
	// directory under /tmp for sockets.
	sockDir, err := os.MkdirTemp("", "vmtest")
	if err != nil {
		t.Fatalf("creating socket tempdir: %v", err)
	}
	t.Cleanup(func() { os.RemoveAll(sockDir) })
	e := &Env{
		t:          t,
		tempDir:    tempDir,
		sockDir:    sockDir,
		binDir:     filepath.Join(tempDir, "bin"),
		eventBus:   newEventBus(),
		testStatus: newTestStatus(),
		nodeStatus: make(map[string]*NodeStatus),
	}
	// Options are applied after the defaults above, in the order given.
	for _, o := range opts {
		o.applyTo(e)
	}
	// Record the final pass/fail state and publish it on the event bus.
	// Cleanups run last-registered-first, so this one runs before the
	// socket directory is removed.
	t.Cleanup(func() {
		e.testStatus.finish(t.Failed())
		e.eventBus.Publish(VMEvent{
			Type:    EventTestStatus,
			Message: e.testStatus.State(),
			Detail:  formatDuration(e.testStatus.Elapsed()),
		})
	})
	return e
}
// EnvOption configures an [Env] in [New]. Its only method is unexported, so
// implementations live in this package.
type EnvOption interface {
	// applyTo mutates e to apply the option. New calls it once per option,
	// in argument order, right after the Env is constructed.
	applyTo(*Env)
}

// envOptFunc adapts a plain function to the EnvOption interface.
type envOptFunc func(*Env)

// applyTo implements EnvOption by calling f itself.
func (f envOptFunc) applyTo(e *Env) { f(e) }
// SameTailnetUser returns an [EnvOption] under which every node registers
// with the test control server as one and the same Tailnet user. Cross-node
// features that require a same-user relationship (Taildrop, for example)
// need this.
func SameTailnetUser() EnvOption {
	var opt envOptFunc = func(e *Env) {
		e.sameTailnetUser = true
	}
	return opt
}
// AllOnline returns an [EnvOption] that has the test control server report
// every peer as Online=true in MapResponses (testcontrol.Server.AllOnline).
// Several disco-key handling fast paths in the controlclient and wgengine
// fire only for peers reported online; without this option they are
// silently skipped, which can hide bugs and slow recovery from disco-key
// rotations.
func AllOnline() EnvOption {
	var opt envOptFunc = func(e *Env) {
		e.allOnline = true
	}
	return opt
}
// PeerRelayGrants returns an [EnvOption] that has the test control server
// grant [tailcfg.PeerCapabilityRelay] and [tailcfg.PeerCapabilityRelayTarget]
// on the wildcard packet filter (testcontrol.Server.PeerRelayGrants).
// Without those capabilities magicsock considers no peer a candidate
// peer-relay server, so a node with [ipn.Prefs.RelayServerPort] set still
// cannot be used as a relay by its peers.
func PeerRelayGrants() EnvOption {
	var opt envOptFunc = func(e *Env) {
		e.peerRelayGrants = true
	}
	return opt
}
// AddNetwork registers a new virtual network with the environment's vnet
// configuration and returns it. The arguments are forwarded untouched to
// vnet.Config.AddNetwork (string IPs, NAT types, NetworkService values).
func (e *Env) AddNetwork(opts ...any) *vnet.Network {
	network := e.cfg.AddNetwork(opts...)
	return network
}
// Node represents a virtual machine in the test environment.
// Create nodes with [Env.AddNode].
type Node struct {
	name             string
	num              int // assigned during AddNode
	os               OSImage
	nets             []*vnet.Network       // networks the node is attached to, in AddNode argument order
	vnetNode         *vnet.Node            // primary vnet node (created in AddNode)
	agent            *vnet.NodeAgentClient // TTA client; set in Start unless noAgent
	joinTailnet      bool                  // defaults to true; cleared by DontJoinTailnet
	noAgent          bool                  // true to skip TTA agent setup (e.g. macOS VMs without TTA)
	advertiseRoutes  string                // comma-separated CIDRs passed to "tailscale up"; empty for none
	snatSubnetRoutes *bool                 // nil means default (true)
	webServerPort    int                   // if > 0, Start launches a webserver on this port
	sshPort          int                   // host port for SSH debug access (cloud VMs only)
}
// AddNode registers a new VM node with the environment. name identifies the
// node and doubles as its webserver greeting. Each option may be a
// *vnet.Network (attaching the node to that network), one of this package's
// NodeOption values, or anything else, which is handed to vnet unchanged
// (for example vnet.TailscaledEnv).
//
// The node defaults to the Gokrazy image and joins the tailnet. If a macOS
// image is requested on a host that is not darwin/arm64, the test is
// skipped.
func (e *Env) AddNode(name string, opts ...any) *Node {
	node := &Node{
		name:        name,
		os:          Gokrazy, // default
		joinTailnet: true,
	}
	e.nodes = append(e.nodes, node)

	// forwarded collects, in order, everything that must reach
	// vnet.Config.AddNode; options consumed purely by vmtest stay out of it.
	var forwarded []any
	for _, opt := range opts {
		switch v := opt.(type) {
		case *vnet.Network:
			node.nets = append(node.nets, v)
			forwarded = append(forwarded, v)
		case nodeOptOS:
			node.os = OSImage(v)
		case nodeOptNoTailscale:
			node.joinTailnet = false
			forwarded = append(forwarded, vnet.DontJoinTailnet)
		case nodeOptNoAgent:
			node.noAgent = true
		case nodeOptAdvertiseRoutes:
			node.advertiseRoutes = string(v)
		case nodeOptSNATSubnetRoutes:
			snat := bool(v)
			node.snatSubnetRoutes = &snat
		case nodeOptWebServer:
			node.webServerPort = int(v)
		default:
			// Pass through to vnet (TailscaledEnv, NodeOption, MAC, etc.)
			forwarded = append(forwarded, opt)
		}
	}

	// macOS guests run via tailmac on Apple's Virtualization.framework,
	// which needs a macOS arm64 host. Skip right away instead of failing
	// somewhere deep in the rest of the setup.
	hostIsAppleSilicon := runtime.GOOS == "darwin" && runtime.GOARCH == "arm64"
	if node.os.IsMacOS && !hostIsAppleSilicon {
		e.t.Skipf("macOS VM tests require a macOS arm64 host (got %s/%s)", runtime.GOOS, runtime.GOARCH)
	}

	node.vnetNode = e.cfg.AddNode(forwarded...)
	node.num = node.vnetNode.Num()
	return node
}
// Name returns the node's name as set in [Env.AddNode].
func (n *Node) Name() string {
	return n.name
}
// LanIP returns the LAN IPv4 address of this node on the given network.
// This is only valid after Env.Start() has been called.
func (n *Node) LanIP(net *vnet.Network) netip.Addr {
	return n.vnetNode.LanIP(net)
}
// DropControlTraffic blackholes control traffic for this node only. It
// walks every network the node is attached to and blocks control traffic
// for the node's LAN address on that network.
func (n *Node) DropControlTraffic() {
	for i := range n.nets {
		nw := n.nets[i]
		addr := n.LanIP(nw)
		nw.BlackholeControlForAddr(addr)
	}
}
// NodeOption types for configuring nodes. Each is a distinct type so that
// Env.AddNode can recognize it in its type switch over the options.

// nodeOptOS selects the node's OS image.
type nodeOptOS OSImage

// nodeOptNoTailscale keeps the node off the tailnet.
type nodeOptNoTailscale struct{}

// nodeOptNoAgent skips TTA agent setup for the node.
type nodeOptNoAgent struct{}

// nodeOptAdvertiseRoutes holds comma-separated CIDRs to advertise.
type nodeOptAdvertiseRoutes string

// nodeOptSNATSubnetRoutes sets whether subnet-route traffic is source-NATed.
type nodeOptSNATSubnetRoutes bool

// nodeOptWebServer holds the port for the node's test webserver.
type nodeOptWebServer int
// OS returns a NodeOption that sets the node's operating system image.
// Nodes without this option use the Gokrazy image (see Env.AddNode).
func OS(img OSImage) nodeOptOS { return nodeOptOS(img) }
// DontJoinTailnet returns a NodeOption that prevents the node from running tailscale up.
// Env.AddNode also passes vnet.DontJoinTailnet on to vnet for such nodes.
func DontJoinTailnet() nodeOptNoTailscale { return nodeOptNoTailscale{} }
// NoAgent returns a NodeOption that skips TTA agent setup. The node will not
// have a test agent, so agent-dependent operations (Status, ExecOnNode, etc.)
// won't work. Useful for VMs that just need to boot and respond to ICMP.
// Env.Start neither creates an agent client nor waits for an agent on such
// nodes.
func NoAgent() nodeOptNoAgent { return nodeOptNoAgent{} }
// AdvertiseRoutes returns a NodeOption that configures the node to advertise
// the given routes (comma-separated CIDRs) when joining the tailnet.
// The string is passed through as the advertise-routes parameter of the
// node's "tailscale up" request.
func AdvertiseRoutes(routes string) nodeOptAdvertiseRoutes {
	return nodeOptAdvertiseRoutes(routes)
}
// SNATSubnetRoutes returns a NodeOption that sets whether the node should
// source NAT traffic to advertised subnet routes. The default is true.
// Setting this to false preserves original source IPs, which is needed
// for site-to-site configurations.
// When the option is not given at all, no snat-subnet-routes parameter is
// sent with the node's "tailscale up" request.
func SNATSubnetRoutes(v bool) nodeOptSNATSubnetRoutes { return nodeOptSNATSubnetRoutes(v) }
// WebServer returns a NodeOption that starts a webserver on the given port.
// The webserver responds with "Hello world I am <nodename> from <sourceIP>" on all requests.
// Env.Start launches it, via the node's TTA agent, only when port is > 0.
func WebServer(port int) nodeOptWebServer { return nodeOptWebServer(port) }
// Start initializes the virtual network, boots all VMs in parallel, and waits
// for all TTA agents to connect. It should be called after all AddNetwork/AddNode calls.
//
// In order, Start:
//   - initializes per-node status and (maybe) the web UI server,
//   - resolves --test-version if it was given,
//   - lets every platform pre-register its steps (dry run),
//   - boots all nodes concurrently,
//   - waits for each node's agent, brings Tailscale up on nodes that join
//     the tailnet, and applies any wanted node capabilities,
//   - starts per-node webservers requested via [WebServer].
//
// Any failure fatals the test. All of the above shares one 10-minute
// context.
func (e *Env) Start() {
	t := e.t
	// The context is stored on the Env and cancelled by test cleanup.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
	t.Cleanup(cancel)
	e.ctx = ctx
	e.initNodeStatus()
	e.maybeStartWebServer()
	if err := os.MkdirAll(e.binDir, 0755); err != nil {
		t.Fatal(err)
	}
	if *testVersion != "" {
		v, err := resolveTestVersion(ctx, *testVersion)
		if err != nil {
			t.Fatalf("resolving --test-version=%q: %v", *testVersion, err)
		}
		e.testVersion = v
		t.Logf("using Tailscale release version %s (from --test-version=%q)", v, *testVersion)
	}
	// Dry-run: let each platform register its steps with the web UI.
	// Steps the test added itself before Start are set aside here and
	// re-appended (re-indexed) after the platform and agent steps, so they
	// are listed last.
	userSteps := e.steps
	e.steps = nil
	for _, n := range e.nodes {
		n.platform().planSteps(e, n)
	}
	for _, n := range e.nodes {
		if !n.noAgent {
			e.Step("Wait for agent: " + n.name)
		}
		if n.joinTailnet {
			e.Step("Tailscale up: " + n.name)
		}
	}
	for _, s := range userSteps {
		s.index = len(e.steps)
		e.steps = append(e.steps, s)
	}
	// Boot all nodes in parallel. Each platform handles its own
	// dependencies (image prep, binary compilation, socket setup)
	// via sync.Once, so independent work overlaps naturally.
	// NOTE(review): the closure captures the loop variable n directly; this
	// is only correct with per-iteration loop variables (Go 1.22+) — confirm
	// against go.mod.
	var bootEg errgroup.Group
	for _, n := range e.nodes {
		bootEg.Go(func() error {
			return n.platform().boot(ctx, e, n)
		})
	}
	if err := bootEg.Wait(); err != nil {
		t.Fatalf("boot: %v", err)
	}
	// Set up agent clients and wait for all agents to connect.
	for _, n := range e.nodes {
		if n.noAgent {
			continue
		}
		e.initVnet() // ensure vnet is ready for agent clients
		n.agent = e.server.NodeAgentClient(n.vnetNode)
		n.vnetNode.SetClient(n.agent)
	}
	// One goroutine per agent-bearing node: wait for the agent, optionally
	// enable the host firewall, then bring Tailscale up.
	// NOTE(review): on the error returns below the step that was begun is
	// never ended (End is only called on the success path) — confirm whether
	// the web UI should see End(err) instead.
	var agentEg errgroup.Group
	for _, n := range e.nodes {
		if n.noAgent {
			continue
		}
		agentEg.Go(func() error {
			aStep := e.Step("Wait for agent: " + n.name)
			aStep.Begin()
			t.Logf("[%s] waiting for agent...", n.name)
			if n.joinTailnet {
				// A successful Status call doubles as the "agent is
				// connected" signal for tailnet nodes.
				st, err := n.agent.Status(ctx)
				if err != nil {
					return fmt.Errorf("[%s] agent status: %w", n.name, err)
				}
				t.Logf("[%s] agent connected, backend state: %s", n.name, st.BackendState)
			} else {
				if err := e.waitForAgentConn(ctx, n); err != nil {
					return fmt.Errorf("[%s] agent connect: %w", n.name, err)
				}
				t.Logf("[%s] agent connected (no tailscale)", n.name)
			}
			aStep.End(nil)
			if n.vnetNode.HostFirewall() {
				if err := n.agent.EnableHostFirewall(ctx); err != nil {
					return fmt.Errorf("[%s] enable firewall: %w", n.name, err)
				}
			}
			if n.joinTailnet {
				tsStep := e.Step("Tailscale up: " + n.name)
				tsStep.Begin()
				if err := e.tailscaleUp(ctx, n); err != nil {
					return fmt.Errorf("[%s] tailscale up: %w", n.name, err)
				}
				st2, err := n.agent.Status(ctx)
				if err != nil {
					return fmt.Errorf("[%s] status after up: %w", n.name, err)
				}
				if st2.BackendState != "Running" {
					return fmt.Errorf("[%s] state = %q, want Running", n.name, st2.BackendState)
				}
				// Apply any capabilities for the node to the map.
				// SetNodeCapMap pushes an updated map response immediately, then wait
				// until the node reports the capability in its status.
				// NOTE(review): st2.Self is dereferenced here and below without
				// the nil check that the polling function applies to st.Self.
				if cm := n.vnetNode.WantCapMap(); cm != nil {
					e.server.ControlServer().SetNodeCapMap(st2.Self.PublicKey, cm)
					if err := tstest.WaitFor(15*time.Second, func() error {
						st, err := n.agent.Status(ctx)
						if err != nil {
							return err
						}
						if st.Self == nil {
							return fmt.Errorf("self is nil")
						}
						for c := range cm {
							if !st.Self.HasCap(c) {
								return fmt.Errorf("cap %v not yet received", c)
							}
						}
						return nil
					}); err != nil {
						return fmt.Errorf("[%s] waiting for capabilities: %w", n.name, err)
					}
				}
				ips := fmt.Sprintf("%v", st2.Self.TailscaleIPs)
				e.setNodeTailscale(n.name, "Running "+ips)
				t.Logf("[%s] up with %v", n.name, st2.Self.TailscaleIPs)
				tsStep.End(nil)
			}
			return nil
		})
	}
	if err := agentEg.Wait(); err != nil {
		t.Fatal(err)
	}
	// Start webservers.
	for _, n := range e.nodes {
		if n.webServerPort > 0 {
			if err := e.startWebServer(ctx, n); err != nil {
				t.Fatalf("startWebServer(%s): %v", n.name, err)
			}
		}
	}
}
// tailscaleUp asks the node's TTA agent to run "tailscale up". Route
// acceptance is always enabled. The advertised routes and the
// snat-subnet-routes setting are added to the request only when they were
// configured on the node. Any non-200 response is returned as an error that
// includes the response body.
func (e *Env) tailscaleUp(ctx context.Context, n *Node) error {
	reqURL := "http://unused/up?accept-routes=true"
	if routes := n.advertiseRoutes; routes != "" {
		reqURL += "&advertise-routes=" + routes
	}
	switch snat := n.snatSubnetRoutes; {
	case snat == nil:
		// Not configured: leave the parameter off entirely.
	case *snat:
		reqURL += "&snat-subnet-routes=true"
	default:
		reqURL += "&snat-subnet-routes=false"
	}

	req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
	if err != nil {
		return err
	}
	resp, err := n.agent.HTTPClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Read errors are deliberately ignored: the body is only used to
	// enrich the error message below.
	payload, _ := io.ReadAll(resp.Body)
	if resp.StatusCode == 200 {
		return nil
	}
	return fmt.Errorf("tailscale up: %s: %s", resp.Status, payload)
}
// startWebServer tells TTA on the node to start a webserver on the node's
// configured webServerPort, greeting clients with the node's name.
//
// The node name is query-escaped so that names containing characters such
// as spaces, '&' or '#' reach the agent intact instead of corrupting the
// request URL. It returns an error if the request cannot be sent or the
// agent answers with a non-200 status; the error includes the response body.
func (e *Env) startWebServer(ctx context.Context, n *Node) error {
	reqURL := fmt.Sprintf("http://unused/start-webserver?port=%d&name=%s", n.webServerPort, url.QueryEscape(n.name))
	req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
	if err != nil {
		return err
	}
	res, err := n.agent.HTTPClient.Do(req)
	if err != nil {
		return err
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		// Best effort: the body only enriches the error message.
		body, _ := io.ReadAll(res.Body)
		return fmt.Errorf("start-webserver: %s: %s", res.Status, body)
	}
	e.t.Logf("[%s] webserver started on port %d", n.name, n.webServerPort)
	return nil
}
// SetExitNode sets the client node's exit node to use for internet traffic.
// If exitNode is nil, the client's exit node is cleared (i.e., turned off).
// Otherwise exitNode must be a tailnet node with an approved 0.0.0.0/0 (and
// ::/0) route, typically configured via [AdvertiseRoutes] and
// [Env.ApproveRoutes].
//
// The exit node is selected by its first Tailscale IP, looked up from the
// exit node's own status. The whole operation is bounded by a 30-second
// timeout. It fatals the test if that status cannot be fetched, carries no
// self node or no Tailscale IPs, or if the prefs edit on the client fails.
func (e *Env) SetExitNode(client, exitNode *Node) {
	e.t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	var ip netip.Addr
	// target names the exit node in error messages; it stays "<nil>" when
	// clearing so a nil exitNode is reported the way %v would print it.
	target := "<nil>"
	if exitNode != nil {
		target = exitNode.name
		st, err := exitNode.agent.Status(ctx)
		if err != nil {
			e.t.Fatalf("SetExitNode: status for %s: %v", exitNode.name, err)
		}
		// Self can be nil in a status response (Start guards against it
		// too); fail with a clear message instead of a nil dereference.
		if st.Self == nil {
			e.t.Fatalf("SetExitNode: status for %s has no self node", exitNode.name)
		}
		if len(st.Self.TailscaleIPs) == 0 {
			e.t.Fatalf("SetExitNode: %s has no Tailscale IPs", exitNode.name)
		}
		ip = st.Self.TailscaleIPs[0]
	}
	// Both fields are always written: the ID is cleared and the IP is set
	// to the chosen address (or the zero Addr when clearing).
	if _, err := client.agent.EditPrefs(ctx, &ipn.MaskedPrefs{
		Prefs: ipn.Prefs{
			ExitNodeID: "",
			ExitNodeIP: ip,
		},
		ExitNodeIDSet: true,
		ExitNodeIPSet: true,
	}); err != nil {
		e.t.Fatalf("SetExitNode(%s -> %s): %v", client.name, target, err)
	}
	if exitNode == nil {
		e.t.Logf("[%s] cleared exit node", client.name)
	} else {
		e.t.Logf("[%s] using exit node %s (%v)", client.name, exitNode.name, ip)
	}
}
// SetExitNodeIP writes the client's ExitNodeIP preference directly from an
// IP address. Use it for plain-WireGuard exit nodes (Mullvad-style) that are
// not tailnet members; passing the zero netip.Addr{} clears the exit node.
// For tailnet exit nodes whose Tailscale IP can be discovered via TTA,
// prefer [Env.SetExitNode]. It fatals the test if the prefs edit fails.
func (e *Env) SetExitNodeIP(client *Node, ip netip.Addr) {
	e.t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	edit := &ipn.MaskedPrefs{
		Prefs: ipn.Prefs{
			ExitNodeID: "",
			ExitNodeIP: ip,
		},
		ExitNodeIDSet: true,
		ExitNodeIPSet: true,
	}
	_, err := client.agent.EditPrefs(ctx, edit)
	if err != nil {
		e.t.Fatalf("SetExitNodeIP(%s, %v): %v", client.name, ip, err)
	}

	if ip.IsValid() {
		e.t.Logf("[%s] using exit-node IP %v", client.name, ip)
		return
	}
	e.t.Logf("[%s] cleared exit node", client.name)
}
// ControlServer returns the underlying test control server, for tests that
// need to inject custom peers, masquerade pairs, etc. The returned server's
// Node store is shared with the running tailnet, so changes take effect on
// the next netmap update sent to peers.
//
// NOTE(review): this reads e.server, which is presumably only set once the
// vnet has been initialized during Start — confirm before calling earlier.
func (e *Env) ControlServer() *testcontrol.Server {
	return e.server.ControlServer()
}
// BringUpMullvadWGServer starts a userspace WireGuard server on n that acts
// as a single-peer, "Mullvad-style" exit-node target. The server lives
// inside n's TTA process and uses a Linux TUN device named "wg0".
//
// gw is the address of the WG interface (e.g. 10.64.0.1/24). The server
// listens on listenPort and accepts exactly one peer, identified by the
// public key peerPub and reachable at peerAllowedIP. Egress traffic from
// masqSrc is MASQUERADEd, so decrypted packets from the peer leave with n's
// WAN IP.
//
// The return value is the server's freshly generated public key. The caller
// must pin it as the peer key on the [tailcfg.Node] it injects into the
// netmap to advertise this server as a plain-WireGuard exit node. Any
// failure fatals the test.
func (e *Env) BringUpMullvadWGServer(n *Node, gw netip.Prefix, listenPort uint16, peerPub key.NodePublic, peerAllowedIP, masqSrc netip.Prefix) key.NodePublic {
	e.t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	rawPeer := peerPub.Raw32()
	params := url.Values{}
	params.Set("addr", gw.String())
	params.Set("listen-port", strconv.Itoa(int(listenPort)))
	params.Set("peer-pub-b64", base64.StdEncoding.EncodeToString(rawPeer[:]))
	params.Set("peer-allowed-ip", peerAllowedIP.String())
	params.Set("masq-src", masqSrc.String())

	req, err := http.NewRequestWithContext(ctx, "GET", "http://unused/wg-server-up?"+params.Encode(), nil)
	if err != nil {
		e.t.Fatalf("BringUpMullvadWGServer: %v", err)
	}
	resp, err := n.agent.HTTPClient.Do(req)
	if err != nil {
		e.t.Fatalf("BringUpMullvadWGServer(%s): %v", n.name, err)
	}
	defer resp.Body.Close()

	// Read errors are ignored; a truncated body surfaces below as a
	// missing or malformed PUBKEY line.
	payload, _ := io.ReadAll(resp.Body)
	if resp.StatusCode != 200 {
		e.t.Fatalf("BringUpMullvadWGServer(%s): %s: %s", n.name, resp.Status, payload)
	}

	// The agent reports the server key on a "PUBKEY=<base64>" line; only
	// the first such line is considered.
	var encoded string
	for _, line := range strings.Split(string(payload), "\n") {
		rest, found := strings.CutPrefix(strings.TrimSpace(line), "PUBKEY=")
		if !found {
			continue
		}
		encoded = rest
		break
	}
	if encoded == "" {
		e.t.Fatalf("BringUpMullvadWGServer(%s): no PUBKEY in response: %q", n.name, payload)
	}

	decoded, err := base64.StdEncoding.DecodeString(encoded)
	if err != nil || len(decoded) != 32 {
		e.t.Fatalf("BringUpMullvadWGServer(%s): bad PUBKEY %q: %v", n.name, encoded, err)
	}
	return key.NodePublicFromRaw32(mem.B(decoded))
}
// Status fetches the node's tailscale status through its TTA agent, with a
// 30-second timeout. A failed fetch fatals the test.
func (e *Env) Status(n *Node) *ipnstate.Status {
	e.t.Helper()

	reqCtx, done := context.WithTimeout(context.Background(), 30*time.Second)
	defer done()

	status, statusErr := n.agent.Status(reqCtx)
	if statusErr != nil {
		e.t.Fatalf("Status(%s): %v", n.name, statusErr)
	}
	return status
}
// ClientMetrics fetches the daemon metrics of node n and returns them as a
// name-keyed map. It fatals the test if the metrics cannot be fetched or
// parsed. Metric families that are not a single counter or gauge value are
// logged and left out of the result.
func (e *Env) ClientMetrics(n *Node) ClientMetrics {
	e.t.Helper()

	raw, err := n.Agent().DaemonMetrics(e.t.Context())
	if err != nil {
		e.t.Fatalf("Node %q DaemonMetrics: %v", n.Name(), err)
	}

	// The daemon reports metrics in Prometheus exposition format.
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(bytes.NewReader(raw))
	if err != nil {
		e.t.Fatalf("Node %q parse client metrics: %v", n.Name(), err)
	}

	// Tailscale client metrics are all unlabelled, integer-valued counters
	// and gauges, so the full generality of the Prometheus data model is
	// not needed here. Anything else is logged and skipped.
	metrics := make(ClientMetrics)
	for _, fam := range families {
		name := fam.GetName()
		if _, dup := metrics[name]; dup {
			e.t.Logf("Node %q: duplicate client metric %q (ignored)", n.Name(), name)
			continue
		}
		if count := len(fam.Metric); count != 1 {
			e.t.Logf("Node %q: got %d values for client metric %q, want 1 (ignored)", n.Name(), count, name)
			continue
		}

		sample := fam.Metric[0]
		var (
			kind string
			val  int64
		)
		switch typ := fam.GetType(); typ {
		case dto.MetricType_COUNTER:
			kind, val = "counter", int64(sample.GetCounter().GetValue())
		case dto.MetricType_GAUGE:
			kind, val = "gauge", int64(sample.GetGauge().GetValue())
		default:
			e.t.Logf("Node %q unexpected client metric %q type %q (ignored)", n.Name(), name, typ.String())
			continue
		}
		metrics[name] = ClientMetric{Name: name, Type: kind, Value: val}
	}
	return metrics
}
// ClientMetrics is a view of the client metrics exported by a node.
// The keys of the map are the metric names; each value repeats its name in
// ClientMetric.Name. Values are produced by [Env.ClientMetrics].
type ClientMetrics map[string]ClientMetric
// ClientMetric is a view of a node client metric.
// Value holds the reported counter or gauge value converted to int64.
type ClientMetric struct {
	Name  string // as published to the clientmetrics package
	Type  string // either "gauge" or "counter"
	Value int64  // the gauge or counter value
}
// SetAcceptRoutes switches the node's RouteAll preference (the
// --accept-routes flag) on or off, which decides whether the node installs
// subnet routes advertised by its peers. The edit is bounded by a 30-second
// timeout and fatals the test on failure.
func (e *Env) SetAcceptRoutes(n *Node, on bool) {
	e.t.Helper()

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	edit := &ipn.MaskedPrefs{
		Prefs:       ipn.Prefs{RouteAll: on},
		RouteAllSet: true,
	}
	_, err := n.agent.EditPrefs(ctx, edit)
	if err != nil {
		e.t.Fatalf("SetAcceptRoutes(%s, %v): %v", n.name, on, err)
	}
	e.t.Logf("[%s] accept-routes=%v", n.name, on)
}
// ApproveRoutes tells the test control server to approve subnet routes
// for the given node. The routes should be CIDR strings.
//
// Besides approving the routes, it enables --accept-routes on every other
// tailnet node, waits up to 15 seconds per peer and route for the route to
// show up, and finally disco-pings n from each peer to establish WireGuard
// tunnels. It fatals the test on any failure: an unparsable route, an
// unreachable agent, a status without a self node, or a peer that never
// sees a route.
func (e *Env) ApproveRoutes(n *Node, routes ...string) {
	e.t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	// Get the node's public key from its status.
	st, err := n.agent.Status(ctx)
	if err != nil {
		e.t.Fatalf("ApproveRoutes: status for %s: %v", n.name, err)
	}
	// Self can be nil in a status response (Start guards against it too);
	// fail with a clear message instead of a nil dereference.
	if st.Self == nil {
		e.t.Fatalf("ApproveRoutes: status for %s has no self node", n.name)
	}
	nodeKey := st.Self.PublicKey
	var prefixes []netip.Prefix
	for _, r := range routes {
		p, err := netip.ParsePrefix(r)
		if err != nil {
			e.t.Fatalf("ApproveRoutes: bad route %q: %v", r, err)
		}
		prefixes = append(prefixes, p)
	}
	// peers is every other node that joins the tailnet; each phase below
	// operates on this same set.
	var peers []*Node
	for _, other := range e.nodes {
		if other != n && other.joinTailnet {
			peers = append(peers, other)
		}
	}
	// Enable --accept-routes on all other tailscale nodes BEFORE setting the
	// routes on the control server. This way, when the map update arrives with
	// the new peer routes, peers will immediately install them.
	for _, other := range peers {
		if _, err := other.agent.EditPrefs(ctx, &ipn.MaskedPrefs{
			Prefs:       ipn.Prefs{RouteAll: true},
			RouteAllSet: true,
		}); err != nil {
			e.t.Fatalf("ApproveRoutes: set accept-routes on %s: %v", other.name, err)
		}
	}
	// Approve the routes on the control server. SetSubnetRoutes notifies all
	// peers via updatePeerChanged, so they'll re-fetch their MapResponse.
	e.server.ControlServer().SetSubnetRoutes(nodeKey, prefixes)
	// Wait for each peer to see the routes.
	for _, r := range routes {
		for _, other := range peers {
			if !e.waitForPeerRoute(other, r, 15*time.Second) {
				e.DumpStatus(other)
				e.t.Fatalf("ApproveRoutes: %s never saw route %s", other.name, r)
			}
		}
	}
	e.t.Logf("approved routes %v on %s", routes, n.name)
	// Ping the advertiser from each peer to establish WireGuard tunnels.
	for _, other := range peers {
		e.ping(other, n)
	}
}
// ping disco-pings to's Tailscale IP from from, retrying for up to 30
// seconds and fataling the test if no attempt succeeds. It exists to wake
// up magicsock peer state before a test runs. Tests that want to assert
// connectivity should call [Env.Ping] with the ping type and timeout they
// actually care about.
func (e *Env) ping(from, to *Node) {
	e.t.Helper()
	err := e.Ping(from, to, tailcfg.PingDisco, 30*time.Second)
	if err == nil {
		return
	}
	e.t.Fatal(err)
}
// Ping pings from one node to another's Tailscale IP using the given ping
// type, retrying until it succeeds or timeout expires. It returns the error
// from the last attempt if the timeout expires. Unlike the internal ping
// helper, it does not fatal the test on failure; callers can check the error
// to assert on timing.
//
// [tailcfg.PingTSMP] actually flows packets across the WireGuard tunnel and is
// the right choice for asserting end-to-end connectivity.
// [tailcfg.PingDisco] only exchanges disco messages between magicsock layers
// and is useful for warming up peer state without requiring a working tunnel.
//
// Each attempt is capped at 3 seconds (or the remaining budget, if smaller)
// and failed attempts are spaced 500ms apart. The pause between attempts is
// cut short when the overall timeout expires, so Ping does not run past
// timeout. An error is returned immediately, without retrying, if the
// target's status cannot be fetched, has no self node, or lists no
// Tailscale IPs.
func (e *Env) Ping(from, to *Node, ptype tailcfg.PingType, timeout time.Duration) error {
	e.t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	toSt, err := to.agent.Status(ctx)
	if err != nil {
		return fmt.Errorf("ping: can't get %s status: %w", to.name, err)
	}
	// Self can be nil in a status response (Start guards against it too);
	// report that as an error instead of dereferencing nil.
	if toSt.Self == nil {
		return fmt.Errorf("ping: %s status has no self node", to.name)
	}
	if len(toSt.Self.TailscaleIPs) == 0 {
		return fmt.Errorf("ping: %s has no Tailscale IPs", to.name)
	}
	targetIP := toSt.Self.TailscaleIPs[0]
	var lastErr error
retry:
	for {
		// Per-attempt timeout: cap at 3s but never exceed the remaining budget.
		attemptTimeout := 3 * time.Second
		if d := time.Until(deadline(ctx)); d < attemptTimeout {
			attemptTimeout = d
		}
		if attemptTimeout <= 0 {
			break
		}
		pingCtx, pingCancel := context.WithTimeout(ctx, attemptTimeout)
		pr, err := from.agent.PingWithOpts(pingCtx, targetIP, ptype, local.PingOpts{})
		pingCancel()
		switch {
		case err != nil:
			lastErr = err
		case pr.Err != "":
			lastErr = fmt.Errorf("%s", pr.Err)
		default:
			e.logVerbosef("ping(%s): %s -> %s OK", ptype, from.name, targetIP)
			return nil
		}
		// Back off before the next attempt, but give up as soon as the
		// overall budget is exhausted rather than sleeping through it.
		select {
		case <-ctx.Done():
			break retry
		case <-time.After(500 * time.Millisecond):
		}
	}
	if lastErr == nil {
		lastErr = ctx.Err()
	}
	return fmt.Errorf("ping(%s): %s -> %s (%s) timed out after %v: %w", ptype, from.name, to.name, targetIP, timeout, lastErr)
}
// deadline reports the deadline carried by ctx, or a zero Time if ctx has
// none. The ok result of ctx.Deadline is intentionally discarded.
func deadline(ctx context.Context) (d time.Time) {
	d, _ = ctx.Deadline()
	return d
}
// PeerDiscoKey reports the disco key that node n currently has on record for
// peer. The error is non-nil when the LocalAPI request itself fails (e.g.
// tailscaled briefly unavailable during a restart). When n is reachable but
// has no entry for peer, it returns (zero, false, nil).
//
// Because it never fatals the test, PeerDiscoKey can be called from inside a
// [tstest.WaitFor] poll loop and transient errors simply cause another poll.
//
// The keys come from the debug-only "peer-disco-keys" LocalAPI action
// ([ipnlocal.LocalBackend.DebugPeerDiscoKeys]) rather than from
// [ipnstate.Status], which keeps the production PeerStatus struct free of
// disco keys (and of non-comparable fields like [key.DiscoPublic] that break
// reflect-based test helpers).
func (e *Env) PeerDiscoKey(n *Node, peer key.NodePublic) (key.DiscoPublic, bool, error) {
	var zero key.DiscoPublic

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	result, err := n.agent.DebugResultJSON(ctx, "peer-disco-keys")
	if err != nil {
		return zero, false, err
	}

	// DebugResultJSON hands back an untyped value (the body was decoded into
	// any), so the map keys are text-encoded node keys. Round-trip through
	// JSON to get a typed map; the extra encode/decode is fine for a test
	// helper.
	encoded, err := json.Marshal(result)
	if err != nil {
		return zero, false, fmt.Errorf("re-marshal: %w", err)
	}
	var discoByNode map[key.NodePublic]key.DiscoPublic
	if err := json.Unmarshal(encoded, &discoByNode); err != nil {
		return zero, false, fmt.Errorf("unmarshal peer-disco-keys: %w", err)
	}

	disco, found := discoByNode[peer]
	return disco, found, nil
}
// RotateDiscoKey tells tailscaled on n to rotate its discovery (magicsock)
// key in place by invoking the "rotate-disco-key" LocalAPI debug action. The
// node key, control connection, and other tailscaled state are unaffected.
// The call is bounded by a 30 second timeout and fatals the test on error.
func (e *Env) RotateDiscoKey(n *Node) {
	e.t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	err := n.agent.DebugAction(ctx, "rotate-disco-key")
	if err == nil {
		return
	}
	e.t.Fatalf("RotateDiscoKey(%s): %v", n.name, err)
}
// RestartTailscaled asks the agent on n (via its /restart-tailscaled handler)
// to make tailscaled die so that its supervisor (gokrazy) restarts it, then
// waits up to 45 seconds for tailscaled to report the "Running" backend state
// again. It fatals the test on error.
//
// Restarting tailscaled is currently only supported on gokrazy nodes.
func (e *Env) RestartTailscaled(n *Node) {
	e.t.Helper()
	if !n.os.IsGokrazy {
		e.t.Fatalf("RestartTailscaled(%s): only supported on gokrazy nodes (have %q)", n.name, n.os.Name)
	}

	// One 60s budget covers both the restart request and the status polling.
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://unused/restart-tailscaled", nil)
	if err != nil {
		e.t.Fatalf("RestartTailscaled(%s): %v", n.name, err)
	}
	res, err := n.agent.HTTPClient.Do(req)
	if err != nil {
		e.t.Fatalf("RestartTailscaled(%s): %v", n.name, err)
	}
	body, _ := io.ReadAll(res.Body)
	res.Body.Close()
	if res.StatusCode != http.StatusOK {
		e.t.Fatalf("RestartTailscaled(%s): %s: %s", n.name, res.Status, body)
	}
	e.t.Logf("[%s] %s", n.name, strings.TrimSpace(string(body)))

	// Status calls fail while the unix socket is gone, then report
	// Starting/NeedsLogin briefly before settling on Running.
	running := func() error {
		st, err := n.agent.Status(ctx)
		switch {
		case err != nil:
			return err
		case st.BackendState != "Running":
			return fmt.Errorf("backend state = %q", st.BackendState)
		}
		return nil
	}
	if err := tstest.WaitFor(45*time.Second, running); err != nil {
		e.t.Fatalf("RestartTailscaled(%s): waiting for Running: %v", n.name, err)
	}
}
// AddRoute installs a kernel static route on node n that sends prefix via
// the given next hop. The work is done by TTA's /add-route handler, so it
// works on any node where TTA is running (which is all of them —
// DontJoinTailnet only skips `tailscale up`; the agent runs regardless).
// Currently Linux-only in TTA.
//
// prefix and via are placed into the request's query string verbatim. The
// request is bounded by a 10 second timeout.
//
// It fatals the test on error.
func (e *Env) AddRoute(n *Node, prefix, via string) {
	e.t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	target := fmt.Sprintf("http://unused/add-route?prefix=%s&via=%s", prefix, via)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
	if err != nil {
		e.t.Fatalf("AddRoute: %v", err)
	}

	res, err := n.agent.HTTPClient.Do(req)
	if err != nil {
		e.t.Fatalf("AddRoute(%s, %s → %s): %v", n.name, prefix, via, err)
	}
	defer res.Body.Close()

	// The body is only used as diagnostic text on failure.
	msg, _ := io.ReadAll(res.Body)
	if res.StatusCode == http.StatusOK {
		return
	}
	e.t.Fatalf("AddRoute(%s, %s → %s): %s: %s", n.name, prefix, via, res.Status, msg)
}
// SSHExec runs cmd on a cloud VM by invoking the local ssh binary against the
// node's debug SSH port on 127.0.0.1, logging in as root with the key at
// /tmp/vmtest_key. It only works for nodes that have a debug SSH port
// configured; otherwise it returns an error without running anything.
//
// The returned string is the command's combined stdout and stderr; the error
// is whatever running ssh reported.
func (e *Env) SSHExec(n *Node, cmd string) (string, error) {
	if n.sshPort == 0 {
		return "", fmt.Errorf("node %s has no SSH debug port", n.name)
	}
	args := []string{
		// Skip host key verification and persistence for the test VM.
		"-o", "StrictHostKeyChecking=no",
		"-o", "UserKnownHostsFile=/dev/null",
		"-o", "ConnectTimeout=5",
		"-i", "/tmp/vmtest_key",
		"-p", fmt.Sprintf("%d", n.sshPort),
		"root@127.0.0.1",
		cmd,
	}
	output, err := exec.Command("ssh", args...).CombinedOutput()
	return string(output), err
}
// DumpStatus writes node n's tailscale status to the test log: one line for
// the node itself (Tailscale IPs, backend state, AllowedIPs, PrimaryRoutes)
// followed by one line per peer (AllowedIPs, Online, Relay, CurAddr). It is
// meant for debugging routing issues. A failure to fetch the status is logged
// rather than fataling the test.
func (e *Env) DumpStatus(n *Node) {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	status, err := n.agent.Status(ctx)
	if err != nil {
		e.t.Logf("[%s] DumpStatus error: %v", n.name, err)
		return
	}

	self := status.Self
	var selfAllowed, selfPrimary []string
	if ips := self.AllowedIPs; ips != nil {
		for i := range ips.Len() {
			selfAllowed = append(selfAllowed, ips.At(i).String())
		}
	}
	if routes := self.PrimaryRoutes; routes != nil {
		for i := range routes.Len() {
			selfPrimary = append(selfPrimary, routes.At(i).String())
		}
	}
	e.t.Logf("[%s] self: %v, backend=%s, AllowedIPs=%v, PrimaryRoutes=%v", n.name, self.TailscaleIPs, status.BackendState, selfAllowed, selfPrimary)

	for _, peer := range status.Peer {
		var allowed []string
		if ips := peer.AllowedIPs; ips != nil {
			for i := range ips.Len() {
				allowed = append(allowed, ips.At(i).String())
			}
		}
		e.t.Logf("[%s] peer %s (%s): AllowedIPs=%v, Online=%v, Relay=%q, CurAddr=%q",
			n.name, peer.HostName, peer.TailscaleIPs,
			allowed, peer.Online, peer.Relay, peer.CurAddr)
	}
}
// waitForPeerRoute polls n's status once per second until some peer's
// AllowedIPs contains prefix (compared by string form) or timeout elapses.
// It reports whether the route was seen. Any error fetching the status,
// including one caused by the timeout, ends the wait with false.
func (e *Env) waitForPeerRoute(n *Node, prefix string, timeout time.Duration) bool {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	for {
		status, err := n.agent.Status(ctx)
		if err != nil {
			return false
		}
		for _, peer := range status.Peer {
			allowed := peer.AllowedIPs
			if allowed == nil {
				continue
			}
			for i := range allowed.Len() {
				if allowed.At(i).String() == prefix {
					return true
				}
			}
		}
		if ctx.Err() != nil {
			return false
		}
		time.Sleep(time.Second)
	}
}
// HTTPGet makes an HTTP GET request from the given node to the specified URL
// and returns the response body. The request is proxied through TTA's
// /http-get handler; targetURL is appended to that handler's query string
// verbatim.
//
// Up to three attempts are made, each bounded by a 6 second timeout. An
// attempt is retried when the request fails, when the body cannot be read in
// full, or when the handler answers 502/503 (after a 2 second pause). Any
// other status is treated as final and its body is returned. If all attempts
// fail, the test is fataled.
func (e *Env) HTTPGet(from *Node, targetURL string) string {
	e.t.Helper()
	for attempt := range 3 {
		ctx, cancel := context.WithTimeout(context.Background(), 6*time.Second)
		reqURL := "http://unused/http-get?url=" + targetURL
		req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
		if err != nil {
			cancel()
			e.t.Fatalf("HTTPGet: %v", err)
		}
		res, err := from.agent.HTTPClient.Do(req)
		if err != nil {
			cancel()
			e.logVerbosef("HTTPGet attempt %d from %s: %v", attempt+1, from.name, err)
			continue
		}
		// The request context also governs reading the response body, so it
		// must stay alive until the body has been consumed; cancelling it
		// earlier can truncate the body.
		body, err := io.ReadAll(res.Body)
		res.Body.Close()
		cancel()
		if err != nil {
			e.logVerbosef("HTTPGet attempt %d from %s: reading body: %v", attempt+1, from.name, err)
			continue
		}
		if res.StatusCode == http.StatusBadGateway || res.StatusCode == http.StatusServiceUnavailable {
			e.t.Logf("HTTPGet attempt %d from %s: status %d, body: %s", attempt+1, from.name, res.StatusCode, string(body))
			time.Sleep(2 * time.Second)
			continue
		}
		return string(body)
	}
	e.t.Fatalf("HTTPGet from %s to %s: all attempts failed", from.name, targetURL)
	return ""
}
// setNodeScreenshot records dataURI as the latest screenshot for the named
// node. It does nothing if there is no status entry for that name.
func (e *Env) setNodeScreenshot(name, dataURI string) {
	e.nodeStatusMu.Lock()
	defer e.nodeStatusMu.Unlock()

	ns := e.nodeStatus[name]
	if ns == nil {
		return
	}
	ns.Screenshot = dataURI
}
// setNodeScreenshotPort records the Host.app screenshot server port for the
// named node. It does nothing if there is no status entry for that name.
func (e *Env) setNodeScreenshotPort(name string, port int) {
	e.nodeStatusMu.Lock()
	defer e.nodeStatusMu.Unlock()

	ns := e.nodeStatus[name]
	if ns == nil {
		return
	}
	ns.ScreenshotPort = port
}
// nodeScreenshotPort looks up the Host.app screenshot server port recorded
// for the named node. It returns 0 when the node has no status entry.
func (e *Env) nodeScreenshotPort(name string) int {
	e.nodeStatusMu.Lock()
	defer e.nodeStatusMu.Unlock()

	ns := e.nodeStatus[name]
	if ns == nil {
		return 0
	}
	return ns.ScreenshotPort
}
// initVnet creates the vnet server from e.cfg exactly once (guarded by
// e.vnetOnce), registers its Close as a test cleanup, installs a DHCP
// callback that mirrors each DHCP step into the per-node status and the event
// bus, and copies the env's control-server options (sameTailnetUser,
// allOnline, peerRelayGrants) onto the control server. It fatals the test if
// the server cannot be created.
func (e *Env) initVnet() {
	e.vnetOnce.Do(func() {
		srv, err := vnet.New(&e.cfg)
		e.server = srv
		if err != nil {
			e.t.Fatalf("vnet.New: %v", err)
		}
		e.t.Cleanup(func() { e.server.Close() })

		e.server.SetDHCPCallback(func(mac vnet.MAC, nodeNum int, msgType layers.DHCPMsgType, ip netip.Addr) {
			name := e.nodeNameByNum(nodeNum)
			nicIdx := e.nicIndexForMAC(name, mac)
			ipStr := ip.String()

			// Fill in the per-message-type parts, then report once below.
			// Message types not listed here are ignored entirely.
			var status string
			ev := VMEvent{NodeName: name, NIC: nicIdx}
			switch msgType {
			case layers.DHCPMsgTypeDiscover:
				status = "Discover sent"
				ev.Type = EventDHCPDiscover
				ev.Message = "DHCP Discover sent"
			case layers.DHCPMsgTypeOffer:
				status = "Offered " + ipStr
				ev.Type = EventDHCPOffer
				ev.Message = "DHCP Offer received"
				ev.Detail = ipStr
			case layers.DHCPMsgTypeRequest:
				status = "Requesting " + ipStr
				ev.Type = EventDHCPRequest
				ev.Message = "DHCP Request sent"
				ev.Detail = ipStr
			case layers.DHCPMsgTypeAck:
				status = "Got " + ipStr
				ev.Type = EventDHCPAck
				ev.Message = "DHCP Ack: got " + ipStr
				ev.Detail = ipStr
			default:
				return
			}
			e.setNodeDHCP(name, nicIdx, status)
			e.eventBus.Publish(ev)
		})

		if e.sameTailnetUser {
			e.server.ControlServer().AllNodesSameUser = true
		}
		if e.allOnline {
			e.server.ControlServer().AllOnline = true
		}
		if e.peerRelayGrants {
			e.server.ControlServer().PeerRelayGrants = true
		}
	})
}
// ensureQEMUSocket sets up, at most once (guarded by e.qemuSockOnce), the
// Unix stream socket that QEMU VMs connect to. It makes sure the vnet server
// exists, listens on "vnet.sock" inside e.sockDir, and hands every accepted
// connection to the vnet server using the QEMU protocol. The listener is
// closed at test cleanup, which also ends the accept loop. It fatals the test
// if listening fails.
func (e *Env) ensureQEMUSocket() {
	e.qemuSockOnce.Do(func() {
		e.initVnet()
		e.sockAddr = filepath.Join(e.sockDir, "vnet.sock")

		ln, err := net.Listen("unix", e.sockAddr)
		if err != nil {
			e.t.Fatalf("listen unix: %v", err)
		}
		e.t.Cleanup(func() { ln.Close() })

		acceptLoop := func() {
			for {
				conn, err := ln.Accept()
				if err != nil {
					// Any Accept error, including the listener being closed,
					// ends the loop.
					return
				}
				go e.server.ServeUnixConn(conn.(*net.UnixConn), vnet.ProtocolQEMU)
			}
		}
		go acceptLoop()
	})
}
// ensureDgramSocket sets up, at most once (guarded by e.dgramSockOnce), the
// Unix datagram socket used by macOS VMs. It makes sure the vnet server
// exists, binds "dgram.sock" inside e.sockDir, and serves that socket from
// the vnet server using the Unix datagram protocol. The socket is closed at
// test cleanup. It fatals the test if the address cannot be resolved or
// bound.
func (e *Env) ensureDgramSocket() {
	e.dgramSockOnce.Do(func() {
		e.initVnet()
		e.dgramSockAddr = filepath.Join(e.sockDir, "dgram.sock")

		addr, err := net.ResolveUnixAddr("unixgram", e.dgramSockAddr)
		if err != nil {
			e.t.Fatalf("resolve dgram addr: %v", err)
		}
		conn, err := net.ListenUnixgram("unixgram", addr)
		if err != nil {
			e.t.Fatalf("listen unixgram: %v", err)
		}
		e.t.Cleanup(func() { conn.Close() })

		go e.server.ServeUnixConn(conn, vnet.ProtocolUnixDGRAM)
	})
}
// ensureCompiled builds the binaries for goos/goarch and registers them with
// the vnet file server. It is safe for concurrent use: a per-platform
// sync.Once (kept in e.compileOnce under e.compileMu) makes sure the work
// runs only once per platform. The compile is reported as a step; a compile
// failure fatals the test.
func (e *Env) ensureCompiled(ctx context.Context, goos, goarch string) {
	platform := goos + "_" + goarch

	e.compileMu.Lock()
	once := e.compileOnce[platform]
	if once == nil {
		once = new(sync.Once)
		mak.Set(&e.compileOnce, platform, once)
	}
	e.compileMu.Unlock()

	once.Do(func() {
		step := e.Step(fmt.Sprintf("Compile %s_%s binaries", goos, goarch))
		step.Begin()
		err := e.compileBinariesForOS(ctx, goos, goarch)
		step.End(err)
		if err != nil {
			e.t.Fatalf("compileBinariesForOS(%s, %s): %v", goos, goarch, err)
		}
		e.registerBinaries(goos, goarch)
	})
}
// imagePrepErrs remembers failed image preparations, keyed by the *sync.Once
// that guarded them. sync.Once runs its function for one caller only, so the
// outcome has to be stored somewhere every caller sharing that Once can read
// it; entries are added only on failure.
var imagePrepErrs sync.Map // *sync.Once -> error

// ensureImage prepares the cloud image for os and returns any error from the
// preparation. Safe for concurrent use; only prepares once per OS name.
//
// The preparation runs under a per-OS-name sync.Once (kept in e.imageOnce
// under e.compileMu) and is reported as a step. Every caller for the same OS
// name gets the same result: if the one-time preparation failed, later and
// concurrent callers receive that error too rather than nil.
func (e *Env) ensureImage(ctx context.Context, os OSImage) error {
	e.compileMu.Lock()
	once, ok := e.imageOnce[os.Name]
	if !ok {
		once = new(sync.Once)
		mak.Set(&e.imageOnce, os.Name, once)
	}
	e.compileMu.Unlock()

	once.Do(func() {
		step := e.Step(fmt.Sprintf("Prepare %s image", os.Name))
		step.Begin()
		err := ensureImage(ctx, os)
		step.End(err)
		if err != nil {
			imagePrepErrs.Store(once, err)
		}
	})
	// Do has returned, so the preparation (whoever ran it) is complete and
	// any failure has already been recorded.
	if v, failed := imagePrepErrs.Load(once); failed {
		return v.(error)
	}
	return nil
}
// registerBinaries reads the compiled tta, tailscale, and tailscaled binaries
// for goos/goarch from e.binDir and registers each with the vnet file server
// under "<goos>_<goarch>/<name>". It makes sure the vnet server exists first
// and fatals the test if a binary cannot be read. Safe for concurrent use.
func (e *Env) registerBinaries(goos, goarch string) {
	e.initVnet()
	platform := goos + "_" + goarch
	binaries := [...]string{"tta", "tailscale", "tailscaled"}
	for _, bin := range binaries {
		contents, err := os.ReadFile(filepath.Join(e.binDir, platform, bin))
		if err != nil {
			e.t.Fatalf("reading compiled %s/%s: %v", platform, bin, err)
		}
		e.server.RegisterFile(platform+"/"+bin, contents)
	}
}
// waitForAgentConn blocks until n's TTA agent answers a plain HTTP GET of its
// root endpoint, which does not require tailscaled. Each probe is limited to
// 2 seconds and failed probes are retried every 500ms. Any response, whatever
// its status code, counts as connected. It returns ctx's error once ctx is
// done, or the error from building the request.
func (e *Env) waitForAgentConn(ctx context.Context, n *Node) error {
	for {
		probeCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
		req, err := http.NewRequestWithContext(probeCtx, "GET", "http://unused/", nil)
		if err != nil {
			cancel()
			return err
		}
		res, doErr := n.agent.HTTPClient.Do(req)
		cancel()
		switch {
		case doErr == nil:
			res.Body.Close()
			return nil
		case ctx.Err() != nil:
			return ctx.Err()
		}
		time.Sleep(500 * time.Millisecond)
	}
}
// Agent returns the node's TTA agent client, or nil if NoAgent is set.
// Callers that may be handed an agent-less node should check for nil before
// using the result.
func (n *Node) Agent() *vnet.NodeAgentClient {
	return n.agent
}
// LANPing pings a LAN IP from the given node using TTA's /ping endpoint.
// It retries for up to 2 minutes, which is enough for a macOS VM to boot
// and acquire a DHCP lease.
//
// Each attempt is bounded by a 5 second timeout and followed by a 2 second
// pause on failure; only every tenth failed attempt is logged. A 200 response
// counts as success. It fatals the test if the node has no agent or if no
// attempt succeeds in time.
func (e *Env) LANPing(from *Node, targetIP netip.Addr) {
	e.t.Helper()
	if from.agent == nil {
		e.t.Fatalf("LANPing: node %s has no agent (NoAgent set?)", from.name)
	}
	e.t.Logf("LANPing: %s -> %s", from.name, targetIP)

	reqURL := fmt.Sprintf("http://unused/ping?host=%s", targetIP)
	giveUpAt := time.Now().Add(2 * time.Minute)
	for attempt := 0; time.Now().Before(giveUpAt); attempt++ {
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
		if err != nil {
			cancel()
			e.t.Fatalf("LANPing: %v", err)
		}
		res, err := from.agent.HTTPClient.Do(req)
		if err != nil {
			cancel()
			if attempt%10 == 0 {
				e.t.Logf("LANPing attempt %d: %v", attempt+1, err)
			}
			time.Sleep(2 * time.Second)
			continue
		}
		// Keep the request context alive until the body has been read: the
		// context also governs the body, and cancelling first can truncate
		// it. The body is only diagnostic text, so a read error is ignored.
		body, _ := io.ReadAll(res.Body)
		res.Body.Close()
		cancel()
		if res.StatusCode == 200 {
			e.t.Logf("LANPing: %s -> %s succeeded on attempt %d", from.name, targetIP, attempt+1)
			return
		}
		if attempt%10 == 0 {
			e.t.Logf("LANPing attempt %d: status %d, body: %s", attempt+1, res.StatusCode, string(body))
		}
		time.Sleep(2 * time.Second)
	}
	e.t.Fatalf("LANPing: %s -> %s timed out after 2 minutes", from.name, targetIP)
}
// SendTaildropFile sends content as a Taildrop file called name from one node
// to another. The destination is to's first Tailscale IP, so to must be on
// the tailnet; the transfer is performed by POSTing the content to the
// sender's TTA /taildrop-send handler. The whole operation is bounded by a 60
// second timeout. It fatals on error.
//
// NOTE(review): name is placed into the query string verbatim; names with
// characters that are special in a query may need escaping — confirm against
// callers.
func (e *Env) SendTaildropFile(from, to *Node, name string, content []byte) {
	e.t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	toSt, err := to.agent.Status(ctx)
	if err != nil {
		e.t.Fatalf("SendTaildropFile: status for %s: %v", to.name, err)
	}
	if len(toSt.Self.TailscaleIPs) == 0 {
		e.t.Fatalf("SendTaildropFile: %s has no Tailscale IPs", to.name)
	}
	dest := toSt.Self.TailscaleIPs[0].String()

	sendURL := fmt.Sprintf("http://unused/taildrop-send?to=%s&name=%s", dest, name)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, sendURL, bytes.NewReader(content))
	if err != nil {
		e.t.Fatalf("SendTaildropFile: %v", err)
	}
	res, err := from.agent.HTTPClient.Do(req)
	if err != nil {
		e.t.Fatalf("SendTaildropFile(%s -> %s): %v", from.name, to.name, err)
	}
	defer res.Body.Close()

	reply, _ := io.ReadAll(res.Body)
	if res.StatusCode != http.StatusOK {
		e.t.Fatalf("SendTaildropFile(%s -> %s): %s: %s", from.name, to.name, res.Status, reply)
	}
	// Relay whatever the handler printed, if anything.
	if text := strings.TrimSpace(string(reply)); text != "" {
		e.t.Logf("[%s] %s", from.name, text)
	}
	e.t.Logf("[%s] sent Taildrop %q (%d bytes) to %s", from.name, name, len(content), to.name)
}
// RecvTaildropFile waits for an incoming Taildrop file on node n by calling
// its TTA /taildrop-recv handler, and returns the file's name (taken from the
// Taildrop-Filename response header) and its contents (the response body).
// The wait is bounded by the provided context and, in addition, by
// RecvTaildropFile's own 90s upper bound. It fatals on error or timeout.
func (e *Env) RecvTaildropFile(ctx context.Context, n *Node) (name string, content []byte) {
	e.t.Helper()
	recvCtx, cancel := context.WithTimeout(ctx, 90*time.Second)
	defer cancel()

	req, err := http.NewRequestWithContext(recvCtx, http.MethodGet, "http://unused/taildrop-recv", nil)
	if err != nil {
		e.t.Fatalf("RecvTaildropFile: %v", err)
	}
	res, err := n.agent.HTTPClient.Do(req)
	if err != nil {
		e.t.Fatalf("RecvTaildropFile(%s): %v", n.name, err)
	}
	defer res.Body.Close()

	data, _ := io.ReadAll(res.Body)
	if res.StatusCode != http.StatusOK {
		e.t.Fatalf("RecvTaildropFile(%s): %s: %s", n.name, res.Status, data)
	}
	name = res.Header.Get("Taildrop-Filename")
	e.t.Logf("[%s] received Taildrop %q (%d bytes)", n.name, name, len(data))
	return name, data
}
var (
	// buildGokrazy guards the one-time (per test process) gokrazy image build.
	buildGokrazy sync.Once
	// buildGokrazyErr is the outcome of that build. It is written only inside
	// buildGokrazy.Do and read only after Do has returned, so it needs no
	// additional locking.
	buildGokrazyErr error
)

// ensureGokrazy builds the gokrazy base image (once per test process) and
// locates the kernel. The build is fast (~4s) so we always rebuild to ensure
// the baked-in binaries (tta, tailscale, tailscaled) match the current source.
//
// The build runs `make natlab` in the module's gokrazy directory with its
// output sent to this process's stdout/stderr. Its result is remembered for
// the whole process: if the one-time build failed, every later call (from
// this or any other Env) returns that same error instead of continuing with
// an image that was never built. On success e.gokrazyBase and e.gokrazyKernel
// are set; a call that finds e.gokrazyBase already set returns immediately.
func (e *Env) ensureGokrazy(ctx context.Context) error {
	if e.gokrazyBase != "" {
		return nil // already found
	}
	modRoot, err := findModRoot()
	if err != nil {
		return err
	}
	buildGokrazy.Do(func() {
		e.t.Logf("building gokrazy natlab image...")
		cmd := exec.CommandContext(ctx, "make", "natlab")
		cmd.Dir = filepath.Join(modRoot, "gokrazy")
		cmd.Stderr = os.Stderr
		cmd.Stdout = os.Stdout
		if err := cmd.Run(); err != nil {
			buildGokrazyErr = fmt.Errorf("make natlab: %w", err)
		}
	})
	if buildGokrazyErr != nil {
		return buildGokrazyErr
	}
	e.gokrazyBase = filepath.Join(modRoot, "gokrazy/natlabapp.qcow2")
	kernel, err := findKernelPath(filepath.Join(modRoot, "go.mod"))
	if err != nil {
		return fmt.Errorf("finding kernel: %w", err)
	}
	e.gokrazyKernel = kernel
	return nil
}
// compileBinariesForOS prepares the tta, tailscale, and tailscaled binaries
// for the given GOOS/GOARCH and places them in e.binDir/<goos>_<goarch>/.
//
// tta is always built from the local source tree (the test agent must match
// the test framework). When --test-version is set, tailscale and tailscaled
// are taken from the downloaded release tarball instead of being compiled
// from source.
//
// Builds (with CGO_ENABLED=0) and the staging of downloaded binaries run
// concurrently; the first error is returned. A build error wraps the
// underlying error and includes the build's combined output.
func (e *Env) compileBinariesForOS(ctx context.Context, goos, goarch string) error {
	modRoot, err := findModRoot()
	if err != nil {
		return err
	}
	dir := goos + "_" + goarch
	outDir := filepath.Join(e.binDir, dir)
	if err := os.MkdirAll(outDir, 0755); err != nil {
		return err
	}

	// Use downloaded release binaries only on Linux: pkgs.tailscale.com only
	// publishes Linux tarballs, so other GOOS values still build from source.
	useDownloaded := e.testVersion != "" && goos == "linux"

	type binary struct{ name, pkg string }
	buildBins := []binary{{"tta", "./cmd/tta"}}
	if !useDownloaded {
		buildBins = append(buildBins,
			binary{"tailscale", "./cmd/tailscale"},
			binary{"tailscaled", "./cmd/tailscaled"})
	}

	var eg errgroup.Group
	for _, bin := range buildBins {
		eg.Go(func() error {
			outPath := filepath.Join(outDir, bin.name)
			e.t.Logf("compiling %s/%s...", dir, bin.name)
			cmd := exec.CommandContext(ctx, "go", "build", "-o", outPath, bin.pkg)
			cmd.Dir = modRoot
			cmd.Env = append(os.Environ(), "GOOS="+goos, "GOARCH="+goarch, "CGO_ENABLED=0")
			if out, err := cmd.CombinedOutput(); err != nil {
				// Wrap with %w so callers can still inspect the underlying
				// error (for example a cancelled context).
				return fmt.Errorf("building %s/%s: %w\n%s", dir, bin.name, err, out)
			}
			e.t.Logf("compiled %s/%s", dir, bin.name)
			return nil
		})
	}
	if useDownloaded {
		eg.Go(func() error {
			srcDir, err := ensureVersionBinaries(ctx, e.testVersion, goarch, e.t.Logf)
			if err != nil {
				return err
			}
			for _, name := range []string{"tailscale", "tailscaled"} {
				if err := copyFile(filepath.Join(srcDir, name), filepath.Join(outDir, name), 0755); err != nil {
					return fmt.Errorf("staging %s/%s: %w", dir, name, err)
				}
			}
			e.t.Logf("staged version %s tailscale & tailscaled for %s", e.testVersion, dir)
			return nil
		})
	}
	return eg.Wait()
}
// copyFile opens src and writes its contents to dst via writeAtomic, passing
// perm as the permission bits. It returns the error from opening src or from
// writeAtomic.
func copyFile(src, dst string, perm os.FileMode) error {
	source, err := os.Open(src)
	if err != nil {
		return err
	}
	defer source.Close()
	return writeAtomic(dst, source, perm)
}
// findModRoot returns the root of the Go module (where go.mod is), as
// reported by `go env GOMOD` run in the current working directory. It returns
// an error if the go command fails or if the directory is not inside a
// module (GOMOD empty or the null device).
func findModRoot() (string, error) {
	// Read stdout only: anything the go command prints on stderr must not
	// end up in the path we parse.
	out, err := exec.Command("go", "env", "GOMOD").Output()
	if err != nil {
		return "", fmt.Errorf("go env GOMOD: %w", err)
	}
	gomod := strings.TrimSpace(string(out))
	if gomod == "" || gomod == os.DevNull {
		return "", fmt.Errorf("not in a Go module")
	}
	return filepath.Dir(gomod), nil
}
// findKernelPath returns the path of the gokrazy kernel's vmlinuz inside the
// Go module cache, using the github.com/tailscale/gokrazy-kernel version
// required by the go.mod file at goMod. This mirrors the logic in
// nat_test.go.
//
// Both forms of requirement are recognised: a line inside a `require (...)`
// block and a single-line `require <module> <version>` directive. It returns
// an error if goMod cannot be read, if `go env GOMODCACHE` fails, or if no
// gokrazy-kernel requirement is found. It does not check that the returned
// file exists.
func findKernelPath(goMod string) (string, error) {
	const kernelModule = "github.com/tailscale/gokrazy-kernel"

	b, err := os.ReadFile(goMod)
	if err != nil {
		return "", err
	}
	// Read stdout only: anything the go command prints on stderr must not
	// end up in the path we build.
	goModCacheB, err := exec.Command("go", "env", "GOMODCACHE").Output()
	if err != nil {
		return "", fmt.Errorf("go env GOMODCACHE: %w", err)
	}
	goModCache := strings.TrimSpace(string(goModCacheB))

	// Scan go.mod for the gokrazy-kernel requirement and its version.
	for _, line := range strings.Split(string(b), "\n") {
		parts := strings.Fields(line)
		if len(parts) > 0 && parts[0] == "require" {
			// Single-line directive: drop the keyword so the remaining
			// fields look like a line from a require block.
			parts = parts[1:]
		}
		if len(parts) >= 2 && strings.HasPrefix(parts[0], kernelModule) {
			return filepath.Join(goModCache, parts[0]+"@"+parts[1], "vmlinuz"), nil
		}
	}
	return "", fmt.Errorf("gokrazy-kernel not found in %s", goMod)
}
// PingRoute describes what connection type was used to transfer a Disco ping.
type PingRoute string

// The PingRoute values produced by classifyPing.
const (
	// PingRouteDirect is reported for any non-empty endpoint that is not
	// recognised as a private address.
	PingRouteDirect PingRoute = "direct"
	// PingRouteDERP is reported when the ping result carries no endpoint.
	PingRouteDERP PingRoute = "derp"
	// PingRouteLocal is reported when the endpoint parses as an address in a
	// private range.
	PingRouteLocal PingRoute = "local"
	// PingRouteNil is reported when there is no ping result at all.
	PingRouteNil PingRoute = "nil"
)
// classifyPing maps a ping result onto the kind of route it travelled. It is
// only really relevant for DiscoPings.
//
// A nil result is PingRouteNil and an empty Endpoint is PingRouteDERP. A
// non-empty Endpoint that parses as ip:port with a private address is
// PingRouteLocal; any other Endpoint, including one that does not parse, is
// PingRouteDirect.
//
// NOTE(review): only Endpoint is inspected, so peer-relayed pings are not
// distinguished here — confirm how results with PeerRelay set should be
// classified.
func classifyPing(pr *ipnstate.PingResult) PingRoute {
	switch {
	case pr == nil:
		return PingRouteNil
	case pr.Endpoint == "":
		return PingRouteDERP
	}
	if addrPort, err := netip.ParseAddrPort(pr.Endpoint); err == nil && addrPort.Addr().IsPrivate() {
		return PingRouteLocal
	}
	return PingRouteDirect
}
// PingExpect disco-pings to's first Tailscale IP from from, over and over,
// until a successful ping is classified as wantRoute or timeout is reached.
// Disco pings are used because they are the only ping type whose result can
// be classified by connection type.
//
// Each attempt is limited to 3 seconds with a 500ms pause between attempts,
// and every successful ping's route is logged. On timeout the returned error
// names the last route that was seen, which is empty if no ping succeeded.
func (e *Env) PingExpect(from, to *Node, wantRoute PingRoute, timeout time.Duration) error {
	e.t.Helper()
	ctx, cancel := context.WithTimeout(e.t.Context(), timeout)
	defer cancel()

	toSt, err := to.agent.Status(ctx)
	if err != nil {
		return fmt.Errorf("ping: can't get %s status: %w", to.name, err)
	}
	if len(toSt.Self.TailscaleIPs) == 0 {
		return fmt.Errorf("ping: %s has no Tailscale IPs", to.name)
	}
	targetIP := toSt.Self.TailscaleIPs[0]

	var lastRoute PingRoute
	for ctx.Err() == nil {
		pingCtx, pingCancel := context.WithTimeout(ctx, 3*time.Second)
		pr, err := from.agent.PingWithOpts(pingCtx, targetIP, tailcfg.PingDisco, local.PingOpts{})
		pingCancel()
		if err == nil && pr.Err == "" {
			got := classifyPing(pr)
			e.t.Logf("Saw ping type %q", got)
			if got == wantRoute {
				return nil
			}
			lastRoute = got
		}
		// Pause before retrying, waking early if the deadline passes.
		select {
		case <-time.After(500 * time.Millisecond):
		case <-ctx.Done():
		}
	}
	return fmt.Errorf("ping route = %q, want %q (after %v)", lastRoute, wantRoute, timeout)
}
// NumNodes reports how many nodes are currently configured in the env.
func (e *Env) NumNodes() int {
	return len(e.nodes)
}