tstest/natlab, .github/workflows: add opt-in natlab CI workflow

The natlab vmtest suite (tstest/natlab/vmtest) and the integration nat tests are gated behind --run-vm-tests because they need KVM and are slow. Until now nothing in CI exercised them apart from a single canary TestEasyEasy run on every PR. Add .github/workflows/natlab-test.yml that runs the full opt-in suite on demand (workflow_dispatch), on PRs labeled "natlab", and on main every 12 hours via cron. The workflow has two phases: - "prepare" builds the gokrazy VM image, downloads the Ubuntu and FreeBSD cloud images once via the new natlabprep tool, and emits a dynamic JSON matrix of every TestX function it finds in the two opt-in packages. - "test" is a per-test matrix that depends on prepare. Each matrix job restores the shared caches and runs a single test, so adding a new TestFoo is automatically picked up on the next run without any workflow edits. Rename the existing natlab-integrationtest.yml to natlab-basic.yml since it's the small smoke variant (just TestEasyEasy on every PR); the new natlab-test.yml is the bigger suite. The job inside is renamed to EasyEasy for the same reason. Move the macOS arm64 host check from vmtest.Env.Start into vmtest.Env.AddNode so a test that adds a vmtest.MacOS node skips immediately on a non-macOS host, and add an explicit skipIfNotMacOSArm64 helper at the top of the two macOS-only tests so the platform requirement is obvious to readers. Quiet the takeAgentConnOne miss log in tstest/natlab/vnet by default (it was the overwhelming majority of bytes in CI logs, with no signal in healthy runs) and replace it with a periodic "still waiting" line that only fires after 10s, so a truly stuck agent connection still surfaces. Updates #13038 Change-Id: I4582098d8865200fd5a73a9b696942319ccf3bf0 Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2026-05-06 20:07:45 +00:00
parent 4eec4423b4
commit e062b46984
9 changed files with 454 additions and 78 deletions
@@ -1,6 +1,7 @@
-# Run some natlab integration tests.
+# Run a single natlab smoke test on every PR. The full natlab suite
+# is opt-in and lives in .github/workflows/natlab-test.yml.
 # See https://github.com/tailscale/tailscale/issues/13038
-name: "natlab-integrationtest"
+name: "natlab-basic"

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -17,7 +18,7 @@ on:
    branches:
      - "main"
 jobs:
-  natlab-integrationtest:
+  EasyEasy:
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
@@ -0,0 +1,182 @@
+# Run the full natlab/vmtest opt-in test suite. These tests boot QEMU VMs
+# (gokrazy, Ubuntu, FreeBSD) and exercise vnet-driven networking scenarios.
+# They are gated behind --run-vm-tests because they need KVM and are slow.
+#
+# This workflow runs:
+#   - on demand (workflow_dispatch)
+#   - on PRs that carry the "natlab" label
+#   - on main, every 12 hours, via cron
+#
+# Layout:
+#   - "prepare" builds the gokrazy VM image, downloads the cloud images
+#     (Ubuntu, FreeBSD), and discovers every Test* function in the two
+#     opt-in packages.
+#   - "test" is a per-TestFoo matrix that depends on prepare. Each matrix
+#     job restores the shared caches and runs a single test. Adding a new
+#     TestFoo automatically gets its own job — no workflow edits needed.
+#
+# A separate workflow (.github/workflows/natlab-basic.yml) runs a single
+# canary natlab test on every PR; this one runs the full suite.
+name: "natlab-test"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [labeled, synchronize, reopened]
+  schedule:
+    # Every 12 hours, off-the-hour to avoid GitHub's :00 cron-stampede window.
+    - cron: "23 3,15 * * *"
+
+jobs:
+  # prepare warms the per-workflow-run caches (gokrazy image, cloud VM
+  # images) and emits the dynamic matrix of test names. By doing the work
+  # once here, the matrix test jobs never race to rebuild or re-download
+  # the same artifacts on a cold cache.
+  prepare:
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'schedule' ||
+      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'natlab'))
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    outputs:
+      matrix: ${{ steps.list.outputs.matrix }}
+    steps:
+      - name: Check out code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      # The cloud VM image cache is keyed only on images.go (image URLs and
+      # SHAs), so it survives across workflow runs and is invalidated only
+      # when a new image source is added.
+      - name: Cache cloud VM images
+        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
+        with:
+          path: ~/.cache/tailscale/vmtest/images
+          key: natlab-vmimages-${{ hashFiles('tstest/natlab/vmtest/images.go') }}
+
+      # The gokrazy VM image is keyed by github.sha. That means we rebuild
+      # it once per commit but matrix test jobs in the same run all share
+      # the result. Per-PR re-runs of the same sha (e.g. a rerun-failed)
+      # also get the cache.
+      - name: Cache gokrazy VM image
+        id: gokrazy-cache
+        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
+        with:
+          path: gokrazy/natlabapp.qcow2
+          key: natlab-gokrazy-${{ github.sha }}
+
+      # qemu-utils provides qemu-img, which the gokrazy Makefile uses to
+      # convert natlabapp.img to qcow2. Only install if we need it (cache
+      # miss); the test matrix jobs install qemu separately for the runtime.
+      - name: Install qemu-utils
+        if: steps.gokrazy-cache.outputs.cache-hit != 'true'
+        run: |
+          sudo rm -f /var/lib/man-db/auto-update
+          sudo apt-get -y update
+          sudo apt-get -y remove man-db
+          sudo apt-get install -y qemu-utils
+
+      - name: Download cloud VM images
+        # natlabprep is idempotent: it checks the cache before downloading.
+        run: |
+          ./tool/go run ./tstest/natlab/vmtest/cmd/natlabprep
+
+      - name: Build gokrazy VM image
+        if: steps.gokrazy-cache.outputs.cache-hit != 'true'
+        run: |
+          make -C gokrazy natlab
+
+      - name: Discover tests
+        id: list
+        # Grep the test files directly rather than invoking `go test -list`
+        # so we don't pay the cost of compiling the test binaries here. The
+        # only test functions in these packages use the canonical
+        # `func TestFoo(t *testing.T)` signature.
+        #
+        # exclude is the set of tests that need special invocation
+        # (extra flags, a specific environment) and don't fit the
+        # single-test-per-matrix-job model. They stay runnable locally.
+        run: |
+          set -euo pipefail
+          exclude='^(TestGrid)$'
+          tmp=$(mktemp)
+          for pkg_dir in tstest/natlab/vmtest tstest/integration/nat; do
+            pkg="./${pkg_dir}/"
+            for f in "${pkg_dir}"/*_test.go; do
+              [ -e "$f" ] || continue
+              grep -hE '^func Test[A-Z][A-Za-z0-9_]*\(t \*testing\.T\)' "$f" \
+                | sed -E 's/^func (Test[A-Za-z0-9_]+).*/\1/' \
+                | grep -vE "$exclude" \
+                | while read -r t; do
+                    jq -nc --arg pkg "$pkg" --arg test "$t" \
+                      '{pkg: $pkg, test: $test}' >> "$tmp"
+                  done
+            done
+          done
+          matrix=$(jq -s -c . "$tmp")
+          echo "matrix=${matrix}" >> "$GITHUB_OUTPUT"
+          echo "Discovered tests:"
+          jq . "$tmp"
+
+  test:
+    needs: prepare
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    name: "${{ matrix.test }}"
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJson(needs.prepare.outputs.matrix) }}
+    steps:
+      - name: Check out code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Enable KVM
+        run: |
+          echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules
+          sudo udevadm control --reload-rules
+          sudo udevadm trigger --name-match=kvm
+
+      - name: Install qemu
+        run: |
+          sudo rm -f /var/lib/man-db/auto-update
+          sudo apt-get -y update
+          sudo apt-get -y remove man-db
+          sudo apt-get install -y qemu-system-x86 qemu-utils
+
+      # restore-only: prepare is the single writer of these caches, so
+      # matrix jobs don't write back. fail-on-cache-miss would be too
+      # strict for the gokrazy cache (e.g. a non-fatal cache eviction
+      # between prepare and us); we just rebuild on miss instead.
+      - name: Restore cloud VM images
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
+        with:
+          path: ~/.cache/tailscale/vmtest/images
+          key: natlab-vmimages-${{ hashFiles('tstest/natlab/vmtest/images.go') }}
+
+      - name: Restore gokrazy VM image
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
+        with:
+          path: gokrazy/natlabapp.qcow2
+          key: natlab-gokrazy-${{ github.sha }}
+
+      # The gokrazy-based tests boot the kernel directly from
+      # vmlinuz that ships in the tailscale/gokrazy-kernel module.
+      # Tests look it up under GOMODCACHE via findKernelPath, so the
+      # module has to be present even though no Go source imports it
+      # in the test package itself.
+      - name: Download gokrazy-kernel module
+        run: |
+          ./tool/go mod download github.com/tailscale/gokrazy-kernel
+
+      - name: Run ${{ matrix.test }}
+        # Per-test timeout is well above the few-minute typical runtime
+        # but small enough that a stuck test fails fast instead of holding
+        # the runner for the job's 20-minute budget.
+        run: |
+          ./tool/go test -v -timeout=15m -count=1 ${{ matrix.pkg }} \
+            -run='^${{ matrix.test }}$' --run-vm-tests
@@ -173,16 +173,21 @@ func (e *Env) generateFreeBSDUserData(n *Node) string {
 		ud.WriteString("  - \"sysctl net.inet6.ip6.forwarding=1\"\n")
 	}

-	// Start tailscaled and tta in the background.
-	// Set PATH to include /usr/local/bin so that tta can find "tailscale"
-	// (TTA uses exec.Command("tailscale", ...) without a full path).
-	// --statedir provides a VarRoot so features like Taildrop have a directory.
+	// Start tailscaled and tta in the background. Redirect stdio to log
+	// files and away from /dev/null on stdin; otherwise nuageinit's runcmd
+	// executor keeps the backgrounded child's stdout/stderr pipes open and
+	// blocks waiting for them, so subsequent runcmd entries (including the
+	// tta launch below) never run. Linux cloud-init doesn't have this
+	// gotcha. Set PATH to include /usr/local/bin so that tta can find
+	// "tailscale" (TTA uses exec.Command("tailscale", ...) without a full
+	// path). --statedir provides a VarRoot so features like Taildrop have a
+	// directory.
 	ud.WriteString("  - \"mkdir -p /var/lib/tailscale\"\n")
-	ud.WriteString("  - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tailscaled --state=mem: --statedir=/var/lib/tailscale &\"\n")
+	ud.WriteString("  - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tailscaled --state=mem: --statedir=/var/lib/tailscale </dev/null >/var/log/tailscaled.log 2>&1 &\"\n")
 	ud.WriteString("  - \"sleep 2\"\n")

-	// Start tta (Tailscale Test Agent).
-	ud.WriteString("  - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tta &\"\n")
+	// Start tta (Tailscale Test Agent), with the same stdio redirection.
+	ud.WriteString("  - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tta </dev/null >/var/log/tta.log 2>&1 &\"\n")

 	return ud.String()
 }
@@ -0,0 +1,24 @@
+// Copyright (c) Tailscale Inc & contributors
+// SPDX-License-Identifier: BSD-3-Clause
+
+// The natlabprep tool warms the local natlab vmtest cache by downloading
+// every cloud VM image natlab can boot. It is intended for CI prep steps
+// so a subsequent test run does not pay the per-image download cost.
+package main
+
+import (
+	"context"
+	"log"
+
+	"tailscale.com/tstest/natlab/vmtest"
+)
+
+func main() {
+	ctx := context.Background()
+	for _, img := range vmtest.CloudImages() {
+		log.Printf("ensuring %s ...", img.Name)
+		if err := vmtest.EnsureImage(ctx, img); err != nil {
+			log.Fatalf("ensuring %s: %v", img.Name, err)
+		}
+	}
+}
@@ -91,6 +91,22 @@ var (
 	}
 )

+// CloudImages returns the set of QEMU-bootable cloud OS images natlab can
+// use for vmtests, excluding gokrazy (built from source) and macOS (which
+// uses a separate snapshot pipeline). It is intended for tooling such as
+// a CI prep step that wants to warm the image cache.
+func CloudImages() []OSImage {
+	return []OSImage{Ubuntu2404, Debian12, FreeBSD150}
+}
+
+// EnsureImage downloads img to the local cache if not already present.
+// It is intended for tooling that wants to warm the image cache before
+// running natlab vmtests (e.g. a CI prep step). The test framework also
+// calls into the package-internal equivalent on demand.
+func EnsureImage(ctx context.Context, img OSImage) error {
+	return ensureImage(ctx, img)
+}
+
 // imageCacheDir returns the directory for cached VM images.
 func imageCacheDir() string {
 	if d := os.Getenv("VMTEST_CACHE_DIR"); d != "" {
@@ -15,6 +15,7 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"testing"
 	"time"

 	"tailscale.com/tstest/natlab/vnet"
@@ -192,22 +193,98 @@ func (e *Env) startCloudQEMU(n *Node) error {
 	return nil
 }

-// launchQEMU starts a qemu-system-x86_64 process with the given args.
+// qemuRun is one running qemu-system-x86_64 process plus the file handles
+// the wrapping code holds open on its behalf. kill tears the whole thing
+// down (used both for normal cleanup and for the in-flight retry path).
+type qemuRun struct {
+	cmd        *exec.Cmd
+	parentPipe *os.File
+	devNull    *os.File
+	qemuLog    *os.File
+}
+
+func (r *qemuRun) kill() {
+	killProcessTree(r.cmd)
+	r.cmd.Wait()
+	r.parentPipe.Close()
+	r.devNull.Close()
+	r.qemuLog.Close()
+}
+
+// launchQEMU starts a qemu-system-x86_64 process with the given args and
+// watches for console activity. If the guest produces no output within
+// stuckTimeout (empty console *and* QEMU has not exited with an error),
+// the QEMU process is killed and re-launched. This works around CI
+// hypervisor flakes seen on shared GitHub Actions runners where a QEMU
+// process starts but its vCPU never makes any forward progress (the
+// failure presents as both the virtconsole log and the QEMU stderr log
+// being zero bytes after many minutes, with the vnet stream socket
+// connected but no packet ever sent).
+//
 // VM console output goes to logPath (via QEMU's -serial or -chardev).
 // QEMU's own stdout/stderr go to logPath.qemu for diagnostics.
 func (e *Env) launchQEMU(name, logPath string, args []string) error {
+	// stuckTimeout is generous: a healthy VM prints SeaBIOS/kernel
+	// output within ~1-2s on KVM, but slow shared CI hardware can lag.
+	// Setting it too low risks killing a healthy-but-slow VM; setting it
+	// too high masks the wedge case we want to recover from.
+	const stuckTimeout = 45 * time.Second
+	const maxAttempts = 3
+
+	var lastErr error
+	for attempt := 1; attempt <= maxAttempts; attempt++ {
+		if attempt > 1 {
+			e.t.Logf("[%s] QEMU made no progress in %v; killing and retrying (attempt %d/%d)", name, stuckTimeout, attempt, maxAttempts)
+			// QEMU's -chardev file backend opens append-mode, so stale
+			// bytes from a previous attempt would falsely trip the
+			// progress check on retry. Truncate it.
+			os.Truncate(logPath, 0)
+		}
+		run, err := e.startQEMUOnce(name, logPath, args)
+		if err != nil {
+			lastErr = err
+			continue
+		}
+		if waitForConsoleProgress(logPath, stuckTimeout) {
+			e.qemuProcs = append(e.qemuProcs, run.cmd)
+			if e.ctx != nil {
+				go e.tailLogFile(e.ctx, name, logPath)
+			}
+			e.t.Cleanup(func() {
+				run.kill()
+				// Dump tail of VM log and QEMU's own stderr on failure.
+				// The console log (logPath) is empty when the guest never
+				// produced output (e.g. QEMU exited before the kernel ran);
+				// in that case the .qemu file holds the only diagnostic —
+				// KVM errors, "kvm not available", CPU model mismatch, etc.
+				if e.t.Failed() {
+					dumpLogTail(e.t, name, "console", logPath)
+					dumpLogTail(e.t, name, "qemu stderr", logPath+".qemu")
+				}
+			})
+			return nil
+		}
+		lastErr = fmt.Errorf("QEMU for %s produced no console output in %v", name, stuckTimeout)
+		run.kill()
+	}
+	return fmt.Errorf("QEMU for %s failed after %d attempts: %w", name, maxAttempts, lastErr)
+}
+
+// startQEMUOnce starts a single qemu-system-x86_64 process. On success the
+// returned qemuRun owns the process and all file handles; the caller must
+// invoke kill (either inline for a retry or via t.Cleanup for the
+// surviving attempt).
+func (e *Env) startQEMUOnce(name, logPath string, args []string) (*qemuRun, error) {
 	cmd := exec.Command("qemu-system-x86_64", args...)
-	// Send stdout/stderr to the log file for any QEMU diagnostic messages.
-	// Stdin must be /dev/null to prevent QEMU from trying to read.
 	devNull, err := os.Open(os.DevNull)
 	if err != nil {
-		return fmt.Errorf("open /dev/null: %w", err)
+		return nil, fmt.Errorf("open /dev/null: %w", err)
 	}
 	cmd.Stdin = devNull
 	qemuLog, err := os.Create(logPath + ".qemu")
 	if err != nil {
 		devNull.Close()
-		return err
+		return nil, err
 	}
 	cmd.Stdout = qemuLog
 	cmd.Stderr = qemuLog
@@ -215,45 +292,69 @@ func (e *Env) launchQEMU(name, logPath string, args []string) error {
 	if err != nil {
 		devNull.Close()
 		qemuLog.Close()
-		return fmt.Errorf("killWithParent: %w", err)
+		return nil, fmt.Errorf("killWithParent: %w", err)
 	}
 	if err := cmd.Start(); err != nil {
 		parentPipe.Close()
 		devNull.Close()
 		qemuLog.Close()
-		return fmt.Errorf("qemu for %s: %w", name, err)
+		return nil, fmt.Errorf("qemu for %s: %w", name, err)
 	}
 	e.t.Logf("launched QEMU for %s (pid %d), log: %s", name, cmd.Process.Pid, logPath)
-	e.qemuProcs = append(e.qemuProcs, cmd)
-
-	// Start tailing the VM console log for the web UI.
-	if e.ctx != nil {
-		go e.tailLogFile(e.ctx, name, logPath)
-	}
-	e.t.Cleanup(func() {
-		killProcessTree(cmd)
-		cmd.Wait()
-		parentPipe.Close()
-		devNull.Close()
-		qemuLog.Close()
-		// Dump tail of VM log on failure for debugging.
-		if e.t.Failed() {
-			if data, err := os.ReadFile(logPath); err == nil {
-				lines := bytes.Split(data, []byte("\n"))
-				start := 0
-				if len(lines) > 50 {
-					start = len(lines) - 50
-				}
-				e.t.Logf("=== last 50 lines of %s log ===", name)
-				for _, line := range lines[start:] {
-					e.t.Logf("[%s] %s", name, line)
-				}
-			}
-		}
-	})
-	return nil
+	return &qemuRun{
+		cmd:        cmd,
+		parentPipe: parentPipe,
+		devNull:    devNull,
+		qemuLog:    qemuLog,
+	}, nil
 }

+// waitForConsoleProgress polls logPath until its size is non-zero or
+// timeout elapses. It returns true on observed forward progress (any
+// bytes written), false on timeout.
+func waitForConsoleProgress(logPath string, timeout time.Duration) bool {
+	deadline := time.Now().Add(timeout)
+	for time.Now().Before(deadline) {
+		if fi, err := os.Stat(logPath); err == nil && fi.Size() > 0 {
+			return true
+		}
+		time.Sleep(200 * time.Millisecond)
+	}
+	return false
+}
+
+// dumpLogTail prints the last 50 lines of the file at path to the test log,
+// prefixed with the VM name and kind (e.g. "console", "qemu stderr"). It is
+// a no-op (with a short note) if the file can't be read or is empty, so
+// callers can use it unconditionally on test failure.
+func dumpLogTail(t testing.TB, name, kind, path string) {
+	t.Helper()
+	data, err := os.ReadFile(path)
+	if err != nil {
+		t.Logf("=== %s %s log unavailable: %v ===", name, kind, err)
+		return
+	}
+	if len(data) == 0 {
+		t.Logf("=== %s %s log is empty ===", name, kind)
+		return
+	}
+	lines := bytes.Split(data, []byte("\n"))
+	start := 0
+	if len(lines) > 50 {
+		start = len(lines) - 50
+	}
+	t.Logf("=== last 50 lines of %s %s log ===", name, kind)
+	for _, line := range lines[start:] {
+		t.Logf("[%s] %s", name, line)
+	}
+}
+
+// hostFwdRe matches a single TCP[HOST_FORWARD] line from QEMU's
+// "info usernet" human-monitor command output, e.g.:
+//
+//	TCP[HOST_FORWARD]  12       127.0.0.1 35323       10.0.2.15    22
+var hostFwdRe = regexp.MustCompile(`TCP\[HOST_FORWARD\]\s+\d+\s+127\.0\.0\.1\s+(\d+)\s+`)
+
 // qmpQueryHostFwd connects to a QEMU QMP socket and queries the host port
 // assigned to the first TCP host forward rule (the SSH debug port).
 func qmpQueryHostFwd(sockPath string) (int, error) {
@@ -271,7 +372,7 @@ func qmpQueryHostFwd(sockPath string) (int, error) {
 		return 0, fmt.Errorf("QMP socket %s not available", sockPath)
 	}
 	defer conn.Close()
-	conn.SetDeadline(time.Now().Add(5 * time.Second))
+	conn.SetDeadline(time.Now().Add(20 * time.Second))

 	// Read the QMP greeting.
 	var greeting json.RawMessage
@@ -287,23 +388,30 @@ func qmpQueryHostFwd(sockPath string) (int, error) {
 		return 0, fmt.Errorf("reading qmp_capabilities response: %w", err)
 	}

-	// Query "info usernet" via human-monitor-command.
-	fmt.Fprintf(conn, `{"execute":"human-monitor-command","arguments":{"command-line":"info usernet"}}`+"\n")
-	var hmpResp struct {
-		Return string `json:"return"`
+	// Poll "info usernet" until the SLIRP host-forward rule appears.
+	// On slow runners (e.g. GitHub Actions) QEMU sometimes returns an
+	// empty "info usernet" if we query it before user-mode networking
+	// has finished wiring up the forward, so single-shot lookups fail.
+	deadline := time.Now().Add(10 * time.Second)
+	var lastReturn string
+	for {
+		fmt.Fprintf(conn, `{"execute":"human-monitor-command","arguments":{"command-line":"info usernet"}}`+"\n")
+		var hmpResp struct {
+			Return string `json:"return"`
+		}
+		if err := dec.Decode(&hmpResp); err != nil {
+			return 0, fmt.Errorf("reading info usernet response: %w", err)
+		}
+		lastReturn = hmpResp.Return
+		if m := hostFwdRe.FindStringSubmatch(hmpResp.Return); m != nil {
+			return strconv.Atoi(m[1])
+		}
+		if time.Now().After(deadline) {
+			break
+		}
+		time.Sleep(100 * time.Millisecond)
 	}
-	if err := dec.Decode(&hmpResp); err != nil {
-		return 0, fmt.Errorf("reading info usernet response: %w", err)
-	}
-
-	// Parse the port from output like:
-	//   TCP[HOST_FORWARD]  12       127.0.0.1 35323       10.0.2.15    22
-	re := regexp.MustCompile(`TCP\[HOST_FORWARD\]\s+\d+\s+127\.0\.0\.1\s+(\d+)\s+`)
-	m := re.FindStringSubmatch(hmpResp.Return)
-	if m == nil {
-		return 0, fmt.Errorf("no hostfwd port found in: %s", hmpResp.Return)
-	}
-	return strconv.Atoi(m[1])
+	return 0, fmt.Errorf("no hostfwd port found after waiting: %q", lastReturn)
 }

 // tailLogFile tails a VM's serial console log file and publishes each line
@@ -420,6 +420,13 @@ func (e *Env) AddNode(name string, opts ...any) *Node {
 		}
 	}

+	// macOS VMs require a macOS arm64 host (Apple Virtualization.framework via
+	// tailmac). Skip the test now rather than letting it proceed through the
+	// rest of the setup only to fail later.
+	if n.os.IsMacOS && (runtime.GOOS != "darwin" || runtime.GOARCH != "arm64") {
+		e.t.Skipf("macOS VM tests require a macOS arm64 host (got %s/%s)", runtime.GOOS, runtime.GOARCH)
+	}
+
 	n.vnetNode = e.cfg.AddNode(vnetOpts...)
 	n.num = n.vnetNode.Num()
 	return n
@@ -504,12 +511,6 @@ func (e *Env) Start() {
 		t.Logf("using Tailscale release version %s (from --test-version=%q)", v, *testVersion)
 	}

-	for _, n := range e.nodes {
-		if n.os.IsMacOS && (runtime.GOOS != "darwin" || runtime.GOARCH != "arm64") {
-			t.Skip("macOS VM tests require macOS arm64 host")
-		}
-	}
-
 	// Dry-run: let each platform register its steps with the web UI.
 	userSteps := e.steps
 	e.steps = nil
@@ -7,6 +7,7 @@ import (
 	"bytes"
 	"fmt"
 	"net/netip"
+	"runtime"
 	"strings"
 	"testing"
 	"time"
@@ -21,7 +22,20 @@ import (
 	"tailscale.com/types/netmap"
 )

+// skipIfNotMacOSArm64 skips the test when the host isn't a macOS arm64 host.
+// macOS VM tests require Apple Virtualization.framework via tailmac.
+// AddNode also enforces this when a macOS node is added, but having an
+// explicit skip at the top of macOS-only tests makes the requirement
+// obvious to readers.
+func skipIfNotMacOSArm64(t *testing.T) {
+	t.Helper()
+	if runtime.GOOS != "darwin" || runtime.GOARCH != "arm64" {
+		t.Skipf("macOS VM tests require a macOS arm64 host (got %s/%s)", runtime.GOOS, runtime.GOARCH)
+	}
+}
+
 func TestMacOSAndLinuxCanPing(t *testing.T) {
+	skipIfNotMacOSArm64(t)
 	env := vmtest.New(t)

 	lan := env.AddNetwork("192.168.1.1/24")
@@ -39,6 +53,7 @@ func TestMacOSAndLinuxCanPing(t *testing.T) {
 }

 func TestTwoMacOSVMsCanPing(t *testing.T) {
+	skipIfNotMacOSArm64(t)
 	env := vmtest.New(t)

 	lan := env.AddNetwork("192.168.1.1/24")
@@ -969,8 +984,18 @@ func TestCachedNetmapAfterRestart(t *testing.T) {
 	}
 	netmapCheckStep.End(nil)

+	// 90s is generous on purpose. After both nodes restart with stale cached
+	// netmap entries, a's first WG handshake to b's pre-restart endpoint
+	// hits the dead NAT mapping on b's side and is silently dropped (we
+	// see this as "no recent outgoing packet" NAT drops in the vnet log).
+	// Recovery then waits on wireguard-go's REKEY_TIMEOUT (~5s) before the
+	// next handshake attempt, and on disco-via-DERP to teach each side the
+	// other's new endpoint. On an idle host this converges in well under
+	// 15s; on a contended host (a 14/16-CPU-loaded local repro, or any
+	// shared CI runner) the same sequence has been observed at 50-60s
+	// because every timer fires multiple times under scheduling jitter.
 	pingStep.Begin()
-	if err := env.Ping(a, b, tailcfg.PingTSMP, 30*time.Second); err != nil {
+	if err := env.Ping(a, b, tailcfg.PingTSMP, 90*time.Second); err != nil {
 		pingStep.End(err)
 		t.Fatal(err)
 	}
@@ -2583,14 +2583,24 @@ func (s *Server) addIdleAgentConn(ac *agentConn) {

 func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok bool) {
 	const debug = false
+	// stuckThreshold is how long we wait before deciding the agent is slow
+	// enough to warrant a log line. Below this we stay quiet because, in
+	// healthy runs with many agent dials in flight, even a few-millisecond
+	// wait would otherwise log every poll for every concurrent waiter.
+	const stuckThreshold = 10 * time.Second
+	start := time.Now()
+	var lastWarn time.Time
 	for {
-		ac, ok := s.takeAgentConnOne(n)
-		if ok {
+		ac, miss := s.takeAgentConnOne(n)
+		if ac != nil {
 			if debug {
 				log.Printf("takeAgentConn: got agent conn for %v", n.mac)
 			}
 			return ac, true
 		}
+		if debug && miss > 0 {
+			log.Printf("takeAgentConnOne: missed %d times for %v", miss, n.mac)
+		}
 		s.mu.Lock()
 		ready := make(chan struct{})
 		mak.Set(&s.agentConnWaiter, n, ready)
@@ -2599,6 +2609,10 @@ func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok b
 		if debug {
 			log.Printf("takeAgentConn: waiting for agent conn for %v", n.mac)
 		}
+		if elapsed := time.Since(start); elapsed > stuckThreshold && time.Since(lastWarn) > stuckThreshold {
+			log.Printf("takeAgentConn: still waiting for agent conn for %v after %v (%d idle conns for other nodes)", n.mac, elapsed.Round(time.Second), miss)
+			lastWarn = time.Now()
+		}
 		select {
 		case <-ctx.Done():
 			return nil, false
@@ -2611,21 +2625,21 @@ func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok b
 	}
 }

-func (s *Server) takeAgentConnOne(n *node) (_ *agentConn, ok bool) {
+// takeAgentConnOne returns an idle agent conn for n if one is available,
+// otherwise nil. miss is the number of idle agent conns for other nodes that
+// were walked over while looking; the caller may use it for diagnostics when
+// a wait drags on.
+func (s *Server) takeAgentConnOne(n *node) (ac *agentConn, miss int) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
-	miss := 0
 	for ac := range s.agentConns {
 		if ac.node == n {
 			s.agentConns.Delete(ac)
-			return ac, true
+			return ac, 0
 		}
 		miss++
 	}
-	if miss > 0 {
-		log.Printf("takeAgentConnOne: missed %d times for %v", miss, n.mac)
-	}
-	return nil, false
+	return nil, miss
 }

 type NodeAgentClient struct {