tstest/natlab, .github/workflows: add opt-in natlab CI workflow

The natlab vmtest suite (tstest/natlab/vmtest) and the integration nat tests are gated behind --run-vm-tests because they need KVM and are slow. Until now nothing in CI exercised them apart from a single canary TestEasyEasy run on every PR. Add .github/workflows/natlab-test.yml that runs the full opt-in suite on demand (workflow_dispatch), on PRs labeled "natlab", and on main every 12 hours via cron. The workflow has two phases: - "prepare" builds the gokrazy VM image, downloads the Ubuntu and FreeBSD cloud images once via the new natlabprep tool, and emits a dynamic JSON matrix of every TestX function it finds in the two opt-in packages. - "test" is a per-test matrix that depends on prepare. Each matrix job restores the shared caches and runs a single test, so adding a new TestFoo is automatically picked up on the next run without any workflow edits. Rename the existing natlab-integrationtest.yml to natlab-basic.yml since it's the small smoke variant (just TestEasyEasy on every PR); the new natlab-test.yml is the bigger suite. The job inside is renamed to EasyEasy for the same reason. Move the macOS arm64 host check from vmtest.Env.Start into vmtest.Env.AddNode so a test that adds a vmtest.MacOS node skips immediately on a non-macOS host, and add an explicit skipIfNotMacOSArm64 helper at the top of the two macOS-only tests so the platform requirement is obvious to readers. Quiet the takeAgentConnOne miss log in tstest/natlab/vnet by default (it was the overwhelming majority of bytes in CI logs, with no signal in healthy runs) and replace it with a periodic "still waiting" line that only fires after 10s, so a truly stuck agent connection still surfaces. Updates #13038 Change-Id: I4582098d8865200fd5a73a9b696942319ccf3bf0 Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2026-05-06 20:07:45 +00:00
parent 4eec4423b4
commit e062b46984
9 changed files with 454 additions and 78 deletions
@@ -1,6 +1,7 @@
-# Run some natlab integration tests.
+# Run a single natlab smoke test on every PR. The full natlab suite
 # is opt-in and lives in .github/workflows/natlab-test.yml.
 # See https://github.com/tailscale/tailscale/issues/13038
-name: "natlab-integrationtest"
+name: "natlab-basic"
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -17,7 +18,7 @@ on:
    branches:
      - "main"
 jobs:
-  natlab-integrationtest:
+  EasyEasy:
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
@@ -0,0 +1,182 @@
 # Run the full natlab/vmtest opt-in test suite. These tests boot QEMU VMs
 # (gokrazy, Ubuntu, FreeBSD) and exercise vnet-driven networking scenarios.
 # They are gated behind --run-vm-tests because they need KVM and are slow.
 #
 # This workflow runs:
 #   - on demand (workflow_dispatch)
 #   - on PRs that carry the "natlab" label
 #   - on main, every 12 hours, via cron
 #
 # Layout:
 #   - "prepare" builds the gokrazy VM image, downloads the cloud images
 #     (Ubuntu, FreeBSD), and discovers every Test* function in the two
 #     opt-in packages.
 #   - "test" is a per-TestFoo matrix that depends on prepare. Each matrix
 #     job restores the shared caches and runs a single test. Adding a new
 #     TestFoo automatically gets its own job — no workflow edits needed.
 #
 # A separate workflow (.github/workflows/natlab-basic.yml) runs a single
 # canary natlab test on every PR; this one runs the full suite.
 name: "natlab-test"
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
 on:
  workflow_dispatch:
  pull_request:
    types: [labeled, synchronize, reopened]
  schedule:
    # Every 12 hours, off-the-hour to avoid GitHub's :00 cron-stampede window.
    - cron: "23 3,15 * * *"
 jobs:
  # prepare warms the per-workflow-run caches (gokrazy image, cloud VM
  # images) and emits the dynamic matrix of test names. By doing the work
  # once here, the matrix test jobs never race to rebuild or re-download
  # the same artifacts on a cold cache.
  prepare:
    if: |
      github.event_name == 'workflow_dispatch' ||
      github.event_name == 'schedule' ||
      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'natlab'))
    runs-on: ubuntu-latest
    timeout-minutes: 30
    outputs:
      matrix: ${{ steps.list.outputs.matrix }}
    steps:
      - name: Check out code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      # The cloud VM image cache is keyed only on images.go (image URLs and
      # SHAs), so it survives across workflow runs and is invalidated only
      # when a new image source is added.
      - name: Cache cloud VM images
        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
        with:
          path: ~/.cache/tailscale/vmtest/images
          key: natlab-vmimages-${{ hashFiles('tstest/natlab/vmtest/images.go') }}
      # The gokrazy VM image is keyed by github.sha. That means we rebuild
      # it once per commit but matrix test jobs in the same run all share
      # the result. Per-PR re-runs of the same sha (e.g. a rerun-failed)
      # also get the cache.
      - name: Cache gokrazy VM image
        id: gokrazy-cache
        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
        with:
          path: gokrazy/natlabapp.qcow2
          key: natlab-gokrazy-${{ github.sha }}
      # qemu-utils provides qemu-img, which the gokrazy Makefile uses to
      # convert natlabapp.img to qcow2. Only install if we need it (cache
      # miss); the test matrix jobs install qemu separately for the runtime.
      - name: Install qemu-utils
        if: steps.gokrazy-cache.outputs.cache-hit != 'true'
        run: |
          sudo rm -f /var/lib/man-db/auto-update
          sudo apt-get -y update
          sudo apt-get -y remove man-db
          sudo apt-get install -y qemu-utils
      - name: Download cloud VM images
        # natlabprep is idempotent: it checks the cache before downloading.
        run: |
          ./tool/go run ./tstest/natlab/vmtest/cmd/natlabprep
      - name: Build gokrazy VM image
        if: steps.gokrazy-cache.outputs.cache-hit != 'true'
        run: |
          make -C gokrazy natlab
      - name: Discover tests
        id: list
        # Grep the test files directly rather than invoking `go test -list`
        # so we don't pay the cost of compiling the test binaries here. The
        # only test functions in these packages use the canonical
        # `func TestFoo(t *testing.T)` signature.
        #
        # exclude is the set of tests that need special invocation
        # (extra flags, a specific environment) and don't fit the
        # single-test-per-matrix-job model. They stay runnable locally.
        run: |
          set -euo pipefail
          exclude='^(TestGrid)$'
          tmp=$(mktemp)
          for pkg_dir in tstest/natlab/vmtest tstest/integration/nat; do
            pkg="./${pkg_dir}/"
            for f in "${pkg_dir}"/*_test.go; do
              [ -e "$f" ] || continue
              grep -hE '^func Test[A-Z][A-Za-z0-9_]*\(t \*testing\.T\)' "$f" \
                | sed -E 's/^func (Test[A-Za-z0-9_]+).*/\1/' \
                | grep -vE "$exclude" \
                | while read -r t; do
                    jq -nc --arg pkg "$pkg" --arg test "$t" \
                      '{pkg: $pkg, test: $test}' >> "$tmp"
                  done
            done
          done
          matrix=$(jq -s -c . "$tmp")
          echo "matrix=${matrix}" >> "$GITHUB_OUTPUT"
          echo "Discovered tests:"
          jq . "$tmp"
  test:
    needs: prepare
    runs-on: ubuntu-latest
    timeout-minutes: 20
    name: "${{ matrix.test }}"
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJson(needs.prepare.outputs.matrix) }}
    steps:
      - name: Check out code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Enable KVM
        run: |
          echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules
          sudo udevadm control --reload-rules
          sudo udevadm trigger --name-match=kvm
      - name: Install qemu
        run: |
          sudo rm -f /var/lib/man-db/auto-update
          sudo apt-get -y update
          sudo apt-get -y remove man-db
          sudo apt-get install -y qemu-system-x86 qemu-utils
      # restore-only: prepare is the single writer of these caches, so
      # matrix jobs don't write back. fail-on-cache-miss would be too
      # strict for the gokrazy cache (e.g. a non-fatal cache eviction
      # between prepare and us); we just rebuild on miss instead.
      - name: Restore cloud VM images
        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
        with:
          path: ~/.cache/tailscale/vmtest/images
          key: natlab-vmimages-${{ hashFiles('tstest/natlab/vmtest/images.go') }}
      - name: Restore gokrazy VM image
        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
        with:
          path: gokrazy/natlabapp.qcow2
          key: natlab-gokrazy-${{ github.sha }}
      # The gokrazy-based tests boot the kernel directly from
      # vmlinuz that ships in the tailscale/gokrazy-kernel module.
      # Tests look it up under GOMODCACHE via findKernelPath, so the
      # module has to be present even though no Go source imports it
      # in the test package itself.
      - name: Download gokrazy-kernel module
        run: |
          ./tool/go mod download github.com/tailscale/gokrazy-kernel
      - name: Run ${{ matrix.test }}
        # Per-test timeout is well above the few-minute typical runtime
        # but small enough that a stuck test fails fast instead of holding
        # the runner for the job's 20-minute budget.
        run: |
          ./tool/go test -v -timeout=15m -count=1 ${{ matrix.pkg }} \
            -run='^${{ matrix.test }}$' --run-vm-tests
@@ -173,16 +173,21 @@ func (e *Env) generateFreeBSDUserData(n *Node) string {
 		ud.WriteString("  - \"sysctl net.inet6.ip6.forwarding=1\"\n")
 	}
-	// Start tailscaled and tta in the background.
+	// Start tailscaled and tta in the background. Redirect stdio to log
-	// Set PATH to include /usr/local/bin so that tta can find "tailscale"
+	// files and away from /dev/null on stdin; otherwise nuageinit's runcmd
-	// (TTA uses exec.Command("tailscale", ...) without a full path).
+	// executor keeps the backgrounded child's stdout/stderr pipes open and
-	// --statedir provides a VarRoot so features like Taildrop have a directory.
+	// blocks waiting for them, so subsequent runcmd entries (including the
 	// tta launch below) never run. Linux cloud-init doesn't have this
 	// gotcha. Set PATH to include /usr/local/bin so that tta can find
 	// "tailscale" (TTA uses exec.Command("tailscale", ...) without a full
 	// path). --statedir provides a VarRoot so features like Taildrop have a
 	// directory.
 	ud.WriteString("  - \"mkdir -p /var/lib/tailscale\"\n")
-	ud.WriteString("  - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tailscaled --state=mem: --statedir=/var/lib/tailscale &\"\n")
+	ud.WriteString("  - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tailscaled --state=mem: --statedir=/var/lib/tailscale </dev/null >/var/log/tailscaled.log 2>&1 &\"\n")
 	ud.WriteString("  - \"sleep 2\"\n")
-	// Start tta (Tailscale Test Agent).
+	// Start tta (Tailscale Test Agent), with the same stdio redirection.
-	ud.WriteString("  - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tta &\"\n")
+	ud.WriteString("  - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tta </dev/null >/var/log/tta.log 2>&1 &\"\n")
 	return ud.String()
 }
@@ -0,0 +1,24 @@
 // Copyright (c) Tailscale Inc & contributors
 // SPDX-License-Identifier: BSD-3-Clause
 // The natlabprep tool warms the local natlab vmtest cache by downloading
 // every cloud VM image natlab can boot. It is intended for CI prep steps
 // so a subsequent test run does not pay the per-image download cost.
 package main
 import (
 	"context"
 	"log"
 	"tailscale.com/tstest/natlab/vmtest"
 )
 func main() {
 	ctx := context.Background()
 	for _, img := range vmtest.CloudImages() {
 		log.Printf("ensuring %s ...", img.Name)
 		if err := vmtest.EnsureImage(ctx, img); err != nil {
 			log.Fatalf("ensuring %s: %v", img.Name, err)
 		}
 	}
 }
@@ -91,6 +91,22 @@ var (
 	}
 )
 // CloudImages returns the set of QEMU-bootable cloud OS images natlab can
 // use for vmtests, excluding gokrazy (built from source) and macOS (which
 // uses a separate snapshot pipeline). It is intended for tooling such as
 // a CI prep step that wants to warm the image cache.
 func CloudImages() []OSImage {
 	return []OSImage{Ubuntu2404, Debian12, FreeBSD150}
 }
 // EnsureImage downloads img to the local cache if not already present.
 // It is intended for tooling that wants to warm the image cache before
 // running natlab vmtests (e.g. a CI prep step). The test framework also
 // calls into the package-internal equivalent on demand.
 func EnsureImage(ctx context.Context, img OSImage) error {
 	return ensureImage(ctx, img)
 }
 // imageCacheDir returns the directory for cached VM images.
 func imageCacheDir() string {
 	if d := os.Getenv("VMTEST_CACHE_DIR"); d != "" {
@@ -15,6 +15,7 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
 	"testing"
 	"time"
 	"tailscale.com/tstest/natlab/vnet"
@@ -192,22 +193,98 @@ func (e *Env) startCloudQEMU(n *Node) error {
 	return nil
 }
-// launchQEMU starts a qemu-system-x86_64 process with the given args.
+// qemuRun is one running qemu-system-x86_64 process plus the file handles
 // the wrapping code holds open on its behalf. kill tears the whole thing
 // down (used both for normal cleanup and for the in-flight retry path).
 type qemuRun struct {
 	cmd        *exec.Cmd
 	parentPipe *os.File
 	devNull    *os.File
 	qemuLog    *os.File
 }
 func (r *qemuRun) kill() {
 	killProcessTree(r.cmd)
 	r.cmd.Wait()
 	r.parentPipe.Close()
 	r.devNull.Close()
 	r.qemuLog.Close()
 }
 // launchQEMU starts a qemu-system-x86_64 process with the given args and
 // watches for console activity. If the guest produces no output within
 // stuckTimeout (empty console *and* QEMU has not exited with an error),
 // the QEMU process is killed and re-launched. This works around CI
 // hypervisor flakes seen on shared GitHub Actions runners where a QEMU
 // process starts but its vCPU never makes any forward progress (the
 // failure presents as both the virtconsole log and the QEMU stderr log
 // being zero bytes after many minutes, with the vnet stream socket
 // connected but no packet ever sent).
 //
 // VM console output goes to logPath (via QEMU's -serial or -chardev).
 // QEMU's own stdout/stderr go to logPath.qemu for diagnostics.
 func (e *Env) launchQEMU(name, logPath string, args []string) error {
 	// stuckTimeout is generous: a healthy VM prints SeaBIOS/kernel
 	// output within ~1-2s on KVM, but slow shared CI hardware can lag.
 	// Setting it too low risks killing a healthy-but-slow VM; setting it
 	// too high masks the wedge case we want to recover from.
 	const stuckTimeout = 45 * time.Second
 	const maxAttempts = 3
 	var lastErr error
 	for attempt := 1; attempt <= maxAttempts; attempt++ {
 		if attempt > 1 {
 			e.t.Logf("[%s] QEMU made no progress in %v; killing and retrying (attempt %d/%d)", name, stuckTimeout, attempt, maxAttempts)
 			// QEMU's -chardev file backend opens append-mode, so stale
 			// bytes from a previous attempt would falsely trip the
 			// progress check on retry. Truncate it.
 			os.Truncate(logPath, 0)
 		}
 		run, err := e.startQEMUOnce(name, logPath, args)
 		if err != nil {
 			lastErr = err
 			continue
 		}
 		if waitForConsoleProgress(logPath, stuckTimeout) {
 			e.qemuProcs = append(e.qemuProcs, run.cmd)
 			if e.ctx != nil {
 				go e.tailLogFile(e.ctx, name, logPath)
 			}
 			e.t.Cleanup(func() {
 				run.kill()
 				// Dump tail of VM log and QEMU's own stderr on failure.
 				// The console log (logPath) is empty when the guest never
 				// produced output (e.g. QEMU exited before the kernel ran);
 				// in that case the .qemu file holds the only diagnostic —
 				// KVM errors, "kvm not available", CPU model mismatch, etc.
 				if e.t.Failed() {
 					dumpLogTail(e.t, name, "console", logPath)
 					dumpLogTail(e.t, name, "qemu stderr", logPath+".qemu")
 				}
 			})
 			return nil
 		}
 		lastErr = fmt.Errorf("QEMU for %s produced no console output in %v", name, stuckTimeout)
 		run.kill()
 	}
 	return fmt.Errorf("QEMU for %s failed after %d attempts: %w", name, maxAttempts, lastErr)
 }
 // startQEMUOnce starts a single qemu-system-x86_64 process. On success the
 // returned qemuRun owns the process and all file handles; the caller must
 // invoke kill (either inline for a retry or via t.Cleanup for the
 // surviving attempt).
 func (e *Env) startQEMUOnce(name, logPath string, args []string) (*qemuRun, error) {
 	cmd := exec.Command("qemu-system-x86_64", args...)
 	// Send stdout/stderr to the log file for any QEMU diagnostic messages.
 	// Stdin must be /dev/null to prevent QEMU from trying to read.
 	devNull, err := os.Open(os.DevNull)
 	if err != nil {
-		return fmt.Errorf("open /dev/null: %w", err)
+		return nil, fmt.Errorf("open /dev/null: %w", err)
 	}
 	cmd.Stdin = devNull
 	qemuLog, err := os.Create(logPath + ".qemu")
 	if err != nil {
 		devNull.Close()
-		return err
+		return nil, err
 	}
 	cmd.Stdout = qemuLog
 	cmd.Stderr = qemuLog
@@ -215,45 +292,69 @@ func (e *Env) launchQEMU(name, logPath string, args []string) error {
 	if err != nil {
 		devNull.Close()
 		qemuLog.Close()
-		return fmt.Errorf("killWithParent: %w", err)
+		return nil, fmt.Errorf("killWithParent: %w", err)
 	}
 	if err := cmd.Start(); err != nil {
 		parentPipe.Close()
 		devNull.Close()
 		qemuLog.Close()
-		return fmt.Errorf("qemu for %s: %w", name, err)
+		return nil, fmt.Errorf("qemu for %s: %w", name, err)
 	}
 	e.t.Logf("launched QEMU for %s (pid %d), log: %s", name, cmd.Process.Pid, logPath)
-	e.qemuProcs = append(e.qemuProcs, cmd)
+	return &qemuRun{
-
+		cmd:        cmd,
-	// Start tailing the VM console log for the web UI.
+		parentPipe: parentPipe,
-	if e.ctx != nil {
+		devNull:    devNull,
-		go e.tailLogFile(e.ctx, name, logPath)
+		qemuLog:    qemuLog,
-	}
+	}, nil
 	e.t.Cleanup(func() {
 		killProcessTree(cmd)
 		cmd.Wait()
 		parentPipe.Close()
 		devNull.Close()
 		qemuLog.Close()
 		// Dump tail of VM log on failure for debugging.
 		if e.t.Failed() {
 			if data, err := os.ReadFile(logPath); err == nil {
 				lines := bytes.Split(data, []byte("\n"))
 				start := 0
 				if len(lines) > 50 {
 					start = len(lines) - 50
 				}
 				e.t.Logf("=== last 50 lines of %s log ===", name)
 				for _, line := range lines[start:] {
 					e.t.Logf("[%s] %s", name, line)
 				}
 			}
 		}
 	})
 	return nil
 }
 // waitForConsoleProgress polls logPath until its size is non-zero or
 // timeout elapses. It returns true on observed forward progress (any
 // bytes written), false on timeout.
 func waitForConsoleProgress(logPath string, timeout time.Duration) bool {
 	deadline := time.Now().Add(timeout)
 	for time.Now().Before(deadline) {
 		if fi, err := os.Stat(logPath); err == nil && fi.Size() > 0 {
 			return true
 		}
 		time.Sleep(200 * time.Millisecond)
 	}
 	return false
 }
 // dumpLogTail prints the last 50 lines of the file at path to the test log,
 // prefixed with the VM name and kind (e.g. "console", "qemu stderr"). It is
 // a no-op (with a short note) if the file can't be read or is empty, so
 // callers can use it unconditionally on test failure.
 func dumpLogTail(t testing.TB, name, kind, path string) {
 	t.Helper()
 	data, err := os.ReadFile(path)
 	if err != nil {
 		t.Logf("=== %s %s log unavailable: %v ===", name, kind, err)
 		return
 	}
 	if len(data) == 0 {
 		t.Logf("=== %s %s log is empty ===", name, kind)
 		return
 	}
 	lines := bytes.Split(data, []byte("\n"))
 	start := 0
 	if len(lines) > 50 {
 		start = len(lines) - 50
 	}
 	t.Logf("=== last 50 lines of %s %s log ===", name, kind)
 	for _, line := range lines[start:] {
 		t.Logf("[%s] %s", name, line)
 	}
 }
 // hostFwdRe matches a single TCP[HOST_FORWARD] line from QEMU's
 // "info usernet" human-monitor command output, e.g.:
 //
 //	TCP[HOST_FORWARD]  12       127.0.0.1 35323       10.0.2.15    22
 var hostFwdRe = regexp.MustCompile(`TCP\[HOST_FORWARD\]\s+\d+\s+127\.0\.0\.1\s+(\d+)\s+`)
 // qmpQueryHostFwd connects to a QEMU QMP socket and queries the host port
 // assigned to the first TCP host forward rule (the SSH debug port).
 func qmpQueryHostFwd(sockPath string) (int, error) {
@@ -271,7 +372,7 @@ func qmpQueryHostFwd(sockPath string) (int, error) {
 		return 0, fmt.Errorf("QMP socket %s not available", sockPath)
 	}
 	defer conn.Close()
-	conn.SetDeadline(time.Now().Add(5 * time.Second))
+	conn.SetDeadline(time.Now().Add(20 * time.Second))
 	// Read the QMP greeting.
 	var greeting json.RawMessage
@@ -287,23 +388,30 @@ func qmpQueryHostFwd(sockPath string) (int, error) {
 		return 0, fmt.Errorf("reading qmp_capabilities response: %w", err)
 	}
-	// Query "info usernet" via human-monitor-command.
+	// Poll "info usernet" until the SLIRP host-forward rule appears.
-	fmt.Fprintf(conn, `{"execute":"human-monitor-command","arguments":{"command-line":"info usernet"}}`+"\n")
+	// On slow runners (e.g. GitHub Actions) QEMU sometimes returns an
-	var hmpResp struct {
+	// empty "info usernet" if we query it before user-mode networking
-		Return string `json:"return"`
+	// has finished wiring up the forward, so single-shot lookups fail.
 	deadline := time.Now().Add(10 * time.Second)
 	var lastReturn string
 	for {
 		fmt.Fprintf(conn, `{"execute":"human-monitor-command","arguments":{"command-line":"info usernet"}}`+"\n")
 		var hmpResp struct {
 			Return string `json:"return"`
 		}
 		if err := dec.Decode(&hmpResp); err != nil {
 			return 0, fmt.Errorf("reading info usernet response: %w", err)
 		}
 		lastReturn = hmpResp.Return
 		if m := hostFwdRe.FindStringSubmatch(hmpResp.Return); m != nil {
 			return strconv.Atoi(m[1])
 		}
 		if time.Now().After(deadline) {
 			break
 		}
 		time.Sleep(100 * time.Millisecond)
 	}
-	if err := dec.Decode(&hmpResp); err != nil {
+	return 0, fmt.Errorf("no hostfwd port found after waiting: %q", lastReturn)
 		return 0, fmt.Errorf("reading info usernet response: %w", err)
 	}
 	// Parse the port from output like:
 	//   TCP[HOST_FORWARD]  12       127.0.0.1 35323       10.0.2.15    22
 	re := regexp.MustCompile(`TCP\[HOST_FORWARD\]\s+\d+\s+127\.0\.0\.1\s+(\d+)\s+`)
 	m := re.FindStringSubmatch(hmpResp.Return)
 	if m == nil {
 		return 0, fmt.Errorf("no hostfwd port found in: %s", hmpResp.Return)
 	}
 	return strconv.Atoi(m[1])
 }
 // tailLogFile tails a VM's serial console log file and publishes each line
@@ -420,6 +420,13 @@ func (e *Env) AddNode(name string, opts ...any) *Node {
 		}
 	}
 	// macOS VMs require a macOS arm64 host (Apple Virtualization.framework via
 	// tailmac). Skip the test now rather than letting it proceed through the
 	// rest of the setup only to fail later.
 	if n.os.IsMacOS && (runtime.GOOS != "darwin" || runtime.GOARCH != "arm64") {
 		e.t.Skipf("macOS VM tests require a macOS arm64 host (got %s/%s)", runtime.GOOS, runtime.GOARCH)
 	}
 	n.vnetNode = e.cfg.AddNode(vnetOpts...)
 	n.num = n.vnetNode.Num()
 	return n
@@ -504,12 +511,6 @@ func (e *Env) Start() {
 		t.Logf("using Tailscale release version %s (from --test-version=%q)", v, *testVersion)
 	}
 	for _, n := range e.nodes {
 		if n.os.IsMacOS && (runtime.GOOS != "darwin" || runtime.GOARCH != "arm64") {
 			t.Skip("macOS VM tests require macOS arm64 host")
 		}
 	}
 	// Dry-run: let each platform register its steps with the web UI.
 	userSteps := e.steps
 	e.steps = nil
@@ -7,6 +7,7 @@ import (
 	"bytes"
 	"fmt"
 	"net/netip"
 	"runtime"
 	"strings"
 	"testing"
 	"time"
@@ -21,7 +22,20 @@ import (
 	"tailscale.com/types/netmap"
 )
 // skipIfNotMacOSArm64 skips the test when the host isn't a macOS arm64 host.
 // macOS VM tests require Apple Virtualization.framework via tailmac.
 // AddNode also enforces this when a macOS node is added, but having an
 // explicit skip at the top of macOS-only tests makes the requirement
 // obvious to readers.
 func skipIfNotMacOSArm64(t *testing.T) {
 	t.Helper()
 	if runtime.GOOS != "darwin" || runtime.GOARCH != "arm64" {
 		t.Skipf("macOS VM tests require a macOS arm64 host (got %s/%s)", runtime.GOOS, runtime.GOARCH)
 	}
 }
 func TestMacOSAndLinuxCanPing(t *testing.T) {
 	skipIfNotMacOSArm64(t)
 	env := vmtest.New(t)
 	lan := env.AddNetwork("192.168.1.1/24")
@@ -39,6 +53,7 @@ func TestMacOSAndLinuxCanPing(t *testing.T) {
 }
 func TestTwoMacOSVMsCanPing(t *testing.T) {
 	skipIfNotMacOSArm64(t)
 	env := vmtest.New(t)
 	lan := env.AddNetwork("192.168.1.1/24")
@@ -969,8 +984,18 @@ func TestCachedNetmapAfterRestart(t *testing.T) {
 	}
 	netmapCheckStep.End(nil)
 	// 90s is generous on purpose. After both nodes restart with stale cached
 	// netmap entries, a's first WG handshake to b's pre-restart endpoint
 	// hits the dead NAT mapping on b's side and is silently dropped (we
 	// see this as "no recent outgoing packet" NAT drops in the vnet log).
 	// Recovery then waits on wireguard-go's REKEY_TIMEOUT (~5s) before the
 	// next handshake attempt, and on disco-via-DERP to teach each side the
 	// other's new endpoint. On an idle host this converges in well under
 	// 15s; on a contended host (a 14/16-CPU-loaded local repro, or any
 	// shared CI runner) the same sequence has been observed at 50-60s
 	// because every timer fires multiple times under scheduling jitter.
 	pingStep.Begin()
-	if err := env.Ping(a, b, tailcfg.PingTSMP, 30*time.Second); err != nil {
+	if err := env.Ping(a, b, tailcfg.PingTSMP, 90*time.Second); err != nil {
 		pingStep.End(err)
 		t.Fatal(err)
 	}
@@ -2583,14 +2583,24 @@ func (s *Server) addIdleAgentConn(ac *agentConn) {
 func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok bool) {
 	const debug = false
 	// stuckThreshold is how long we wait before deciding the agent is slow
 	// enough to warrant a log line. Below this we stay quiet because, in
 	// healthy runs with many agent dials in flight, even a few-millisecond
 	// wait would otherwise log every poll for every concurrent waiter.
 	const stuckThreshold = 10 * time.Second
 	start := time.Now()
 	var lastWarn time.Time
 	for {
-		ac, ok := s.takeAgentConnOne(n)
+		ac, miss := s.takeAgentConnOne(n)
-		if ok {
+		if ac != nil {
 			if debug {
 				log.Printf("takeAgentConn: got agent conn for %v", n.mac)
 			}
 			return ac, true
 		}
 		if debug && miss > 0 {
 			log.Printf("takeAgentConnOne: missed %d times for %v", miss, n.mac)
 		}
 		s.mu.Lock()
 		ready := make(chan struct{})
 		mak.Set(&s.agentConnWaiter, n, ready)
@@ -2599,6 +2609,10 @@ func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok b
 		if debug {
 			log.Printf("takeAgentConn: waiting for agent conn for %v", n.mac)
 		}
 		if elapsed := time.Since(start); elapsed > stuckThreshold && time.Since(lastWarn) > stuckThreshold {
 			log.Printf("takeAgentConn: still waiting for agent conn for %v after %v (%d idle conns for other nodes)", n.mac, elapsed.Round(time.Second), miss)
 			lastWarn = time.Now()
 		}
 		select {
 		case <-ctx.Done():
 			return nil, false
@@ -2611,21 +2625,21 @@ func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok b
 	}
 }
-func (s *Server) takeAgentConnOne(n *node) (_ *agentConn, ok bool) {
+// takeAgentConnOne returns an idle agent conn for n if one is available,
 // otherwise nil. miss is the number of idle agent conns for other nodes that
 // were walked over while looking; the caller may use it for diagnostics when
 // a wait drags on.
 func (s *Server) takeAgentConnOne(n *node) (ac *agentConn, miss int) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	miss := 0
 	for ac := range s.agentConns {
 		if ac.node == n {
 			s.agentConns.Delete(ac)
-			return ac, true
+			return ac, 0
 		}
 		miss++
 	}
-	if miss > 0 {
+	return nil, miss
 		log.Printf("takeAgentConnOne: missed %d times for %v", miss, n.mac)
 	}
 	return nil, false
 }
 type NodeAgentClient struct {