diff --git a/.github/workflows/natlab-integrationtest.yml b/.github/workflows/natlab-basic.yml similarity index 89% rename from .github/workflows/natlab-integrationtest.yml rename to .github/workflows/natlab-basic.yml index 6c0d49575..584da6e69 100644 --- a/.github/workflows/natlab-integrationtest.yml +++ b/.github/workflows/natlab-basic.yml @@ -1,6 +1,7 @@ -# Run some natlab integration tests. +# Run a single natlab smoke test on every PR. The full natlab suite +# is opt-in and lives in .github/workflows/natlab-test.yml. # See https://github.com/tailscale/tailscale/issues/13038 -name: "natlab-integrationtest" +name: "natlab-basic" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -17,7 +18,7 @@ on: branches: - "main" jobs: - natlab-integrationtest: + EasyEasy: runs-on: ubuntu-latest steps: - name: Check out code diff --git a/.github/workflows/natlab-test.yml b/.github/workflows/natlab-test.yml new file mode 100644 index 000000000..7741ef12f --- /dev/null +++ b/.github/workflows/natlab-test.yml @@ -0,0 +1,182 @@ +# Run the full natlab/vmtest opt-in test suite. These tests boot QEMU VMs +# (gokrazy, Ubuntu, FreeBSD) and exercise vnet-driven networking scenarios. +# They are gated behind --run-vm-tests because they need KVM and are slow. +# +# This workflow runs: +# - on demand (workflow_dispatch) +# - on PRs that carry the "natlab" label +# - on main, every 12 hours, via cron +# +# Layout: +# - "prepare" builds the gokrazy VM image, downloads the cloud images +# (Ubuntu, FreeBSD), and discovers every Test* function in the two +# opt-in packages. +# - "test" is a per-TestFoo matrix that depends on prepare. Each matrix +# job restores the shared caches and runs a single test. Adding a new +# TestFoo automatically gets its own job — no workflow edits needed. +# +# A separate workflow (.github/workflows/natlab-basic.yml) runs a single +# canary natlab test on every PR; this one runs the full suite. +name: "natlab-test" + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +on: + workflow_dispatch: + pull_request: + types: [labeled, synchronize, reopened] + schedule: + # Every 12 hours, off-the-hour to avoid GitHub's :00 cron-stampede window. + - cron: "23 3,15 * * *" + +jobs: + # prepare warms the per-workflow-run caches (gokrazy image, cloud VM + # images) and emits the dynamic matrix of test names. By doing the work + # once here, the matrix test jobs never race to rebuild or re-download + # the same artifacts on a cold cache. + prepare: + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'schedule' || + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'natlab')) + runs-on: ubuntu-latest + timeout-minutes: 30 + outputs: + matrix: ${{ steps.list.outputs.matrix }} + steps: + - name: Check out code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + # The cloud VM image cache is keyed only on images.go (image URLs and + # SHAs), so it survives across workflow runs and is invalidated only + # when a new image source is added. + - name: Cache cloud VM images + uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 + with: + path: ~/.cache/tailscale/vmtest/images + key: natlab-vmimages-${{ hashFiles('tstest/natlab/vmtest/images.go') }} + + # The gokrazy VM image is keyed by github.sha. That means we rebuild + # it once per commit but matrix test jobs in the same run all share + # the result. Per-PR re-runs of the same sha (e.g. a rerun-failed) + # also get the cache. + - name: Cache gokrazy VM image + id: gokrazy-cache + uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 + with: + path: gokrazy/natlabapp.qcow2 + key: natlab-gokrazy-${{ github.sha }} + + # qemu-utils provides qemu-img, which the gokrazy Makefile uses to + # convert natlabapp.img to qcow2. Only install if we need it (cache + # miss); the test matrix jobs install qemu separately for the runtime. + - name: Install qemu-utils + if: steps.gokrazy-cache.outputs.cache-hit != 'true' + run: | + sudo rm -f /var/lib/man-db/auto-update + sudo apt-get -y update + sudo apt-get -y remove man-db + sudo apt-get install -y qemu-utils + + - name: Download cloud VM images + # natlabprep is idempotent: it checks the cache before downloading. + run: | + ./tool/go run ./tstest/natlab/vmtest/cmd/natlabprep + + - name: Build gokrazy VM image + if: steps.gokrazy-cache.outputs.cache-hit != 'true' + run: | + make -C gokrazy natlab + + - name: Discover tests + id: list + # Grep the test files directly rather than invoking `go test -list` + # so we don't pay the cost of compiling the test binaries here. The + # only test functions in these packages use the canonical + # `func TestFoo(t *testing.T)` signature. + # + # exclude is the set of tests that need special invocation + # (extra flags, a specific environment) and don't fit the + # single-test-per-matrix-job model. They stay runnable locally. + run: | + set -euo pipefail + exclude='^(TestGrid)$' + tmp=$(mktemp) + for pkg_dir in tstest/natlab/vmtest tstest/integration/nat; do + pkg="./${pkg_dir}/" + for f in "${pkg_dir}"/*_test.go; do + [ -e "$f" ] || continue + grep -hE '^func Test[A-Z][A-Za-z0-9_]*\(t \*testing\.T\)' "$f" \ + | sed -E 's/^func (Test[A-Za-z0-9_]+).*/\1/' \ + | grep -vE "$exclude" \ + | while read -r t; do + jq -nc --arg pkg "$pkg" --arg test "$t" \ + '{pkg: $pkg, test: $test}' >> "$tmp" + done + done + done + matrix=$(jq -s -c . "$tmp") + echo "matrix=${matrix}" >> "$GITHUB_OUTPUT" + echo "Discovered tests:" + jq . "$tmp" + + test: + needs: prepare + runs-on: ubuntu-latest + timeout-minutes: 20 + name: "${{ matrix.test }}" + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.prepare.outputs.matrix) }} + steps: + - name: Check out code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Enable KVM + run: | + echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules + sudo udevadm control --reload-rules + sudo udevadm trigger --name-match=kvm + + - name: Install qemu + run: | + sudo rm -f /var/lib/man-db/auto-update + sudo apt-get -y update + sudo apt-get -y remove man-db + sudo apt-get install -y qemu-system-x86 qemu-utils + + # restore-only: prepare is the single writer of these caches, so + # matrix jobs don't write back. fail-on-cache-miss would be too + # strict for the gokrazy cache (e.g. a non-fatal cache eviction + # between prepare and us); we just rebuild on miss instead. + - name: Restore cloud VM images + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 + with: + path: ~/.cache/tailscale/vmtest/images + key: natlab-vmimages-${{ hashFiles('tstest/natlab/vmtest/images.go') }} + + - name: Restore gokrazy VM image + uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 + with: + path: gokrazy/natlabapp.qcow2 + key: natlab-gokrazy-${{ github.sha }} + + # The gokrazy-based tests boot the kernel directly from + # vmlinuz that ships in the tailscale/gokrazy-kernel module. + # Tests look it up under GOMODCACHE via findKernelPath, so the + # module has to be present even though no Go source imports it + # in the test package itself. + - name: Download gokrazy-kernel module + run: | + ./tool/go mod download github.com/tailscale/gokrazy-kernel + + - name: Run ${{ matrix.test }} + # Per-test timeout is well above the few-minute typical runtime + # but small enough that a stuck test fails fast instead of holding + # the runner for the job's 20-minute budget. + run: | + ./tool/go test -v -timeout=15m -count=1 ${{ matrix.pkg }} \ + -run='^${{ matrix.test }}$' --run-vm-tests diff --git a/tstest/natlab/vmtest/cloudinit.go b/tstest/natlab/vmtest/cloudinit.go index a00f849ba..f0ef704fe 100644 --- a/tstest/natlab/vmtest/cloudinit.go +++ b/tstest/natlab/vmtest/cloudinit.go @@ -173,16 +173,21 @@ func (e *Env) generateFreeBSDUserData(n *Node) string { ud.WriteString(" - \"sysctl net.inet6.ip6.forwarding=1\"\n") } - // Start tailscaled and tta in the background. - // Set PATH to include /usr/local/bin so that tta can find "tailscale" - // (TTA uses exec.Command("tailscale", ...) without a full path). - // --statedir provides a VarRoot so features like Taildrop have a directory. + // Start tailscaled and tta in the background. Redirect stdio to log + // files and away from /dev/null on stdin; otherwise nuageinit's runcmd + // executor keeps the backgrounded child's stdout/stderr pipes open and + // blocks waiting for them, so subsequent runcmd entries (including the + // tta launch below) never run. Linux cloud-init doesn't have this + // gotcha. Set PATH to include /usr/local/bin so that tta can find + // "tailscale" (TTA uses exec.Command("tailscale", ...) without a full + // path). --statedir provides a VarRoot so features like Taildrop have a + // directory. ud.WriteString(" - \"mkdir -p /var/lib/tailscale\"\n") - ud.WriteString(" - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tailscaled --state=mem: --statedir=/var/lib/tailscale &\"\n") + ud.WriteString(" - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tailscaled --state=mem: --statedir=/var/lib/tailscale /var/log/tailscaled.log 2>&1 &\"\n") ud.WriteString(" - \"sleep 2\"\n") - // Start tta (Tailscale Test Agent). - ud.WriteString(" - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tta &\"\n") + // Start tta (Tailscale Test Agent), with the same stdio redirection. + ud.WriteString(" - \"export PATH=/usr/local/bin:$PATH && /usr/local/bin/tta /var/log/tta.log 2>&1 &\"\n") return ud.String() } diff --git a/tstest/natlab/vmtest/cmd/natlabprep/natlabprep.go b/tstest/natlab/vmtest/cmd/natlabprep/natlabprep.go new file mode 100644 index 000000000..d60b878e3 --- /dev/null +++ b/tstest/natlab/vmtest/cmd/natlabprep/natlabprep.go @@ -0,0 +1,24 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +// The natlabprep tool warms the local natlab vmtest cache by downloading +// every cloud VM image natlab can boot. It is intended for CI prep steps +// so a subsequent test run does not pay the per-image download cost. +package main + +import ( + "context" + "log" + + "tailscale.com/tstest/natlab/vmtest" +) + +func main() { + ctx := context.Background() + for _, img := range vmtest.CloudImages() { + log.Printf("ensuring %s ...", img.Name) + if err := vmtest.EnsureImage(ctx, img); err != nil { + log.Fatalf("ensuring %s: %v", img.Name, err) + } + } +} diff --git a/tstest/natlab/vmtest/images.go b/tstest/natlab/vmtest/images.go index bce5452a4..e01bf1d71 100644 --- a/tstest/natlab/vmtest/images.go +++ b/tstest/natlab/vmtest/images.go @@ -91,6 +91,22 @@ var ( } ) +// CloudImages returns the set of QEMU-bootable cloud OS images natlab can +// use for vmtests, excluding gokrazy (built from source) and macOS (which +// uses a separate snapshot pipeline). It is intended for tooling such as +// a CI prep step that wants to warm the image cache. +func CloudImages() []OSImage { + return []OSImage{Ubuntu2404, Debian12, FreeBSD150} +} + +// EnsureImage downloads img to the local cache if not already present. +// It is intended for tooling that wants to warm the image cache before +// running natlab vmtests (e.g. a CI prep step). The test framework also +// calls into the package-internal equivalent on demand. +func EnsureImage(ctx context.Context, img OSImage) error { + return ensureImage(ctx, img) +} + // imageCacheDir returns the directory for cached VM images. func imageCacheDir() string { if d := os.Getenv("VMTEST_CACHE_DIR"); d != "" { diff --git a/tstest/natlab/vmtest/qemu.go b/tstest/natlab/vmtest/qemu.go index 1486a9ef8..177c8bca9 100644 --- a/tstest/natlab/vmtest/qemu.go +++ b/tstest/natlab/vmtest/qemu.go @@ -15,6 +15,7 @@ import ( "regexp" "strconv" "strings" + "testing" "time" "tailscale.com/tstest/natlab/vnet" @@ -192,22 +193,98 @@ func (e *Env) startCloudQEMU(n *Node) error { return nil } -// launchQEMU starts a qemu-system-x86_64 process with the given args. +// qemuRun is one running qemu-system-x86_64 process plus the file handles +// the wrapping code holds open on its behalf. kill tears the whole thing +// down (used both for normal cleanup and for the in-flight retry path). +type qemuRun struct { + cmd *exec.Cmd + parentPipe *os.File + devNull *os.File + qemuLog *os.File +} + +func (r *qemuRun) kill() { + killProcessTree(r.cmd) + r.cmd.Wait() + r.parentPipe.Close() + r.devNull.Close() + r.qemuLog.Close() +} + +// launchQEMU starts a qemu-system-x86_64 process with the given args and +// watches for console activity. If the guest produces no output within +// stuckTimeout (empty console *and* QEMU has not exited with an error), +// the QEMU process is killed and re-launched. This works around CI +// hypervisor flakes seen on shared GitHub Actions runners where a QEMU +// process starts but its vCPU never makes any forward progress (the +// failure presents as both the virtconsole log and the QEMU stderr log +// being zero bytes after many minutes, with the vnet stream socket +// connected but no packet ever sent). +// // VM console output goes to logPath (via QEMU's -serial or -chardev). // QEMU's own stdout/stderr go to logPath.qemu for diagnostics. func (e *Env) launchQEMU(name, logPath string, args []string) error { + // stuckTimeout is generous: a healthy VM prints SeaBIOS/kernel + // output within ~1-2s on KVM, but slow shared CI hardware can lag. + // Setting it too low risks killing a healthy-but-slow VM; setting it + // too high masks the wedge case we want to recover from. + const stuckTimeout = 45 * time.Second + const maxAttempts = 3 + + var lastErr error + for attempt := 1; attempt <= maxAttempts; attempt++ { + if attempt > 1 { + e.t.Logf("[%s] QEMU made no progress in %v; killing and retrying (attempt %d/%d)", name, stuckTimeout, attempt, maxAttempts) + // QEMU's -chardev file backend opens append-mode, so stale + // bytes from a previous attempt would falsely trip the + // progress check on retry. Truncate it. + os.Truncate(logPath, 0) + } + run, err := e.startQEMUOnce(name, logPath, args) + if err != nil { + lastErr = err + continue + } + if waitForConsoleProgress(logPath, stuckTimeout) { + e.qemuProcs = append(e.qemuProcs, run.cmd) + if e.ctx != nil { + go e.tailLogFile(e.ctx, name, logPath) + } + e.t.Cleanup(func() { + run.kill() + // Dump tail of VM log and QEMU's own stderr on failure. + // The console log (logPath) is empty when the guest never + // produced output (e.g. QEMU exited before the kernel ran); + // in that case the .qemu file holds the only diagnostic — + // KVM errors, "kvm not available", CPU model mismatch, etc. + if e.t.Failed() { + dumpLogTail(e.t, name, "console", logPath) + dumpLogTail(e.t, name, "qemu stderr", logPath+".qemu") + } + }) + return nil + } + lastErr = fmt.Errorf("QEMU for %s produced no console output in %v", name, stuckTimeout) + run.kill() + } + return fmt.Errorf("QEMU for %s failed after %d attempts: %w", name, maxAttempts, lastErr) +} + +// startQEMUOnce starts a single qemu-system-x86_64 process. On success the +// returned qemuRun owns the process and all file handles; the caller must +// invoke kill (either inline for a retry or via t.Cleanup for the +// surviving attempt). +func (e *Env) startQEMUOnce(name, logPath string, args []string) (*qemuRun, error) { cmd := exec.Command("qemu-system-x86_64", args...) - // Send stdout/stderr to the log file for any QEMU diagnostic messages. - // Stdin must be /dev/null to prevent QEMU from trying to read. devNull, err := os.Open(os.DevNull) if err != nil { - return fmt.Errorf("open /dev/null: %w", err) + return nil, fmt.Errorf("open /dev/null: %w", err) } cmd.Stdin = devNull qemuLog, err := os.Create(logPath + ".qemu") if err != nil { devNull.Close() - return err + return nil, err } cmd.Stdout = qemuLog cmd.Stderr = qemuLog @@ -215,45 +292,69 @@ func (e *Env) launchQEMU(name, logPath string, args []string) error { if err != nil { devNull.Close() qemuLog.Close() - return fmt.Errorf("killWithParent: %w", err) + return nil, fmt.Errorf("killWithParent: %w", err) } if err := cmd.Start(); err != nil { parentPipe.Close() devNull.Close() qemuLog.Close() - return fmt.Errorf("qemu for %s: %w", name, err) + return nil, fmt.Errorf("qemu for %s: %w", name, err) } e.t.Logf("launched QEMU for %s (pid %d), log: %s", name, cmd.Process.Pid, logPath) - e.qemuProcs = append(e.qemuProcs, cmd) - - // Start tailing the VM console log for the web UI. - if e.ctx != nil { - go e.tailLogFile(e.ctx, name, logPath) - } - e.t.Cleanup(func() { - killProcessTree(cmd) - cmd.Wait() - parentPipe.Close() - devNull.Close() - qemuLog.Close() - // Dump tail of VM log on failure for debugging. - if e.t.Failed() { - if data, err := os.ReadFile(logPath); err == nil { - lines := bytes.Split(data, []byte("\n")) - start := 0 - if len(lines) > 50 { - start = len(lines) - 50 - } - e.t.Logf("=== last 50 lines of %s log ===", name) - for _, line := range lines[start:] { - e.t.Logf("[%s] %s", name, line) - } - } - } - }) - return nil + return &qemuRun{ + cmd: cmd, + parentPipe: parentPipe, + devNull: devNull, + qemuLog: qemuLog, + }, nil } +// waitForConsoleProgress polls logPath until its size is non-zero or +// timeout elapses. It returns true on observed forward progress (any +// bytes written), false on timeout. +func waitForConsoleProgress(logPath string, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if fi, err := os.Stat(logPath); err == nil && fi.Size() > 0 { + return true + } + time.Sleep(200 * time.Millisecond) + } + return false +} + +// dumpLogTail prints the last 50 lines of the file at path to the test log, +// prefixed with the VM name and kind (e.g. "console", "qemu stderr"). It is +// a no-op (with a short note) if the file can't be read or is empty, so +// callers can use it unconditionally on test failure. +func dumpLogTail(t testing.TB, name, kind, path string) { + t.Helper() + data, err := os.ReadFile(path) + if err != nil { + t.Logf("=== %s %s log unavailable: %v ===", name, kind, err) + return + } + if len(data) == 0 { + t.Logf("=== %s %s log is empty ===", name, kind) + return + } + lines := bytes.Split(data, []byte("\n")) + start := 0 + if len(lines) > 50 { + start = len(lines) - 50 + } + t.Logf("=== last 50 lines of %s %s log ===", name, kind) + for _, line := range lines[start:] { + t.Logf("[%s] %s", name, line) + } +} + +// hostFwdRe matches a single TCP[HOST_FORWARD] line from QEMU's +// "info usernet" human-monitor command output, e.g.: +// +// TCP[HOST_FORWARD] 12 127.0.0.1 35323 10.0.2.15 22 +var hostFwdRe = regexp.MustCompile(`TCP\[HOST_FORWARD\]\s+\d+\s+127\.0\.0\.1\s+(\d+)\s+`) + // qmpQueryHostFwd connects to a QEMU QMP socket and queries the host port // assigned to the first TCP host forward rule (the SSH debug port). func qmpQueryHostFwd(sockPath string) (int, error) { @@ -271,7 +372,7 @@ func qmpQueryHostFwd(sockPath string) (int, error) { return 0, fmt.Errorf("QMP socket %s not available", sockPath) } defer conn.Close() - conn.SetDeadline(time.Now().Add(5 * time.Second)) + conn.SetDeadline(time.Now().Add(20 * time.Second)) // Read the QMP greeting. var greeting json.RawMessage @@ -287,23 +388,30 @@ func qmpQueryHostFwd(sockPath string) (int, error) { return 0, fmt.Errorf("reading qmp_capabilities response: %w", err) } - // Query "info usernet" via human-monitor-command. - fmt.Fprintf(conn, `{"execute":"human-monitor-command","arguments":{"command-line":"info usernet"}}`+"\n") - var hmpResp struct { - Return string `json:"return"` + // Poll "info usernet" until the SLIRP host-forward rule appears. + // On slow runners (e.g. GitHub Actions) QEMU sometimes returns an + // empty "info usernet" if we query it before user-mode networking + // has finished wiring up the forward, so single-shot lookups fail. + deadline := time.Now().Add(10 * time.Second) + var lastReturn string + for { + fmt.Fprintf(conn, `{"execute":"human-monitor-command","arguments":{"command-line":"info usernet"}}`+"\n") + var hmpResp struct { + Return string `json:"return"` + } + if err := dec.Decode(&hmpResp); err != nil { + return 0, fmt.Errorf("reading info usernet response: %w", err) + } + lastReturn = hmpResp.Return + if m := hostFwdRe.FindStringSubmatch(hmpResp.Return); m != nil { + return strconv.Atoi(m[1]) + } + if time.Now().After(deadline) { + break + } + time.Sleep(100 * time.Millisecond) } - if err := dec.Decode(&hmpResp); err != nil { - return 0, fmt.Errorf("reading info usernet response: %w", err) - } - - // Parse the port from output like: - // TCP[HOST_FORWARD] 12 127.0.0.1 35323 10.0.2.15 22 - re := regexp.MustCompile(`TCP\[HOST_FORWARD\]\s+\d+\s+127\.0\.0\.1\s+(\d+)\s+`) - m := re.FindStringSubmatch(hmpResp.Return) - if m == nil { - return 0, fmt.Errorf("no hostfwd port found in: %s", hmpResp.Return) - } - return strconv.Atoi(m[1]) + return 0, fmt.Errorf("no hostfwd port found after waiting: %q", lastReturn) } // tailLogFile tails a VM's serial console log file and publishes each line diff --git a/tstest/natlab/vmtest/vmtest.go b/tstest/natlab/vmtest/vmtest.go index df3cca81a..8dbf720f6 100644 --- a/tstest/natlab/vmtest/vmtest.go +++ b/tstest/natlab/vmtest/vmtest.go @@ -420,6 +420,13 @@ func (e *Env) AddNode(name string, opts ...any) *Node { } } + // macOS VMs require a macOS arm64 host (Apple Virtualization.framework via + // tailmac). Skip the test now rather than letting it proceed through the + // rest of the setup only to fail later. + if n.os.IsMacOS && (runtime.GOOS != "darwin" || runtime.GOARCH != "arm64") { + e.t.Skipf("macOS VM tests require a macOS arm64 host (got %s/%s)", runtime.GOOS, runtime.GOARCH) + } + n.vnetNode = e.cfg.AddNode(vnetOpts...) n.num = n.vnetNode.Num() return n @@ -504,12 +511,6 @@ func (e *Env) Start() { t.Logf("using Tailscale release version %s (from --test-version=%q)", v, *testVersion) } - for _, n := range e.nodes { - if n.os.IsMacOS && (runtime.GOOS != "darwin" || runtime.GOARCH != "arm64") { - t.Skip("macOS VM tests require macOS arm64 host") - } - } - // Dry-run: let each platform register its steps with the web UI. userSteps := e.steps e.steps = nil diff --git a/tstest/natlab/vmtest/vmtest_test.go b/tstest/natlab/vmtest/vmtest_test.go index 9390c2267..fe8d00bef 100644 --- a/tstest/natlab/vmtest/vmtest_test.go +++ b/tstest/natlab/vmtest/vmtest_test.go @@ -7,6 +7,7 @@ import ( "bytes" "fmt" "net/netip" + "runtime" "strings" "testing" "time" @@ -21,7 +22,20 @@ import ( "tailscale.com/types/netmap" ) +// skipIfNotMacOSArm64 skips the test when the host isn't a macOS arm64 host. +// macOS VM tests require Apple Virtualization.framework via tailmac. +// AddNode also enforces this when a macOS node is added, but having an +// explicit skip at the top of macOS-only tests makes the requirement +// obvious to readers. +func skipIfNotMacOSArm64(t *testing.T) { + t.Helper() + if runtime.GOOS != "darwin" || runtime.GOARCH != "arm64" { + t.Skipf("macOS VM tests require a macOS arm64 host (got %s/%s)", runtime.GOOS, runtime.GOARCH) + } +} + func TestMacOSAndLinuxCanPing(t *testing.T) { + skipIfNotMacOSArm64(t) env := vmtest.New(t) lan := env.AddNetwork("192.168.1.1/24") @@ -39,6 +53,7 @@ func TestMacOSAndLinuxCanPing(t *testing.T) { } func TestTwoMacOSVMsCanPing(t *testing.T) { + skipIfNotMacOSArm64(t) env := vmtest.New(t) lan := env.AddNetwork("192.168.1.1/24") @@ -969,8 +984,18 @@ func TestCachedNetmapAfterRestart(t *testing.T) { } netmapCheckStep.End(nil) + // 90s is generous on purpose. After both nodes restart with stale cached + // netmap entries, a's first WG handshake to b's pre-restart endpoint + // hits the dead NAT mapping on b's side and is silently dropped (we + // see this as "no recent outgoing packet" NAT drops in the vnet log). + // Recovery then waits on wireguard-go's REKEY_TIMEOUT (~5s) before the + // next handshake attempt, and on disco-via-DERP to teach each side the + // other's new endpoint. On an idle host this converges in well under + // 15s; on a contended host (a 14/16-CPU-loaded local repro, or any + // shared CI runner) the same sequence has been observed at 50-60s + // because every timer fires multiple times under scheduling jitter. pingStep.Begin() - if err := env.Ping(a, b, tailcfg.PingTSMP, 30*time.Second); err != nil { + if err := env.Ping(a, b, tailcfg.PingTSMP, 90*time.Second); err != nil { pingStep.End(err) t.Fatal(err) } diff --git a/tstest/natlab/vnet/vnet.go b/tstest/natlab/vnet/vnet.go index 06c382aef..958da04de 100644 --- a/tstest/natlab/vnet/vnet.go +++ b/tstest/natlab/vnet/vnet.go @@ -2583,14 +2583,24 @@ func (s *Server) addIdleAgentConn(ac *agentConn) { func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok bool) { const debug = false + // stuckThreshold is how long we wait before deciding the agent is slow + // enough to warrant a log line. Below this we stay quiet because, in + // healthy runs with many agent dials in flight, even a few-millisecond + // wait would otherwise log every poll for every concurrent waiter. + const stuckThreshold = 10 * time.Second + start := time.Now() + var lastWarn time.Time for { - ac, ok := s.takeAgentConnOne(n) - if ok { + ac, miss := s.takeAgentConnOne(n) + if ac != nil { if debug { log.Printf("takeAgentConn: got agent conn for %v", n.mac) } return ac, true } + if debug && miss > 0 { + log.Printf("takeAgentConnOne: missed %d times for %v", miss, n.mac) + } s.mu.Lock() ready := make(chan struct{}) mak.Set(&s.agentConnWaiter, n, ready) @@ -2599,6 +2609,10 @@ func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok b if debug { log.Printf("takeAgentConn: waiting for agent conn for %v", n.mac) } + if elapsed := time.Since(start); elapsed > stuckThreshold && time.Since(lastWarn) > stuckThreshold { + log.Printf("takeAgentConn: still waiting for agent conn for %v after %v (%d idle conns for other nodes)", n.mac, elapsed.Round(time.Second), miss) + lastWarn = time.Now() + } select { case <-ctx.Done(): return nil, false @@ -2611,21 +2625,21 @@ func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok b } } -func (s *Server) takeAgentConnOne(n *node) (_ *agentConn, ok bool) { +// takeAgentConnOne returns an idle agent conn for n if one is available, +// otherwise nil. miss is the number of idle agent conns for other nodes that +// were walked over while looking; the caller may use it for diagnostics when +// a wait drags on. +func (s *Server) takeAgentConnOne(n *node) (ac *agentConn, miss int) { s.mu.Lock() defer s.mu.Unlock() - miss := 0 for ac := range s.agentConns { if ac.node == n { s.agentConns.Delete(ac) - return ac, true + return ac, 0 } miss++ } - if miss > 0 { - log.Printf("takeAgentConnOne: missed %d times for %v", miss, n.mac) - } - return nil, false + return nil, miss } type NodeAgentClient struct {