From 7dcb378875bff846904d4c5c3252debc88900683 Mon Sep 17 00:00:00 2001
From: Brad Fitzpatrick <bradfitz@tailscale.com>
Date: Mon, 13 Apr 2026 23:20:56 +0000
Subject: [PATCH] tstest/integration/nat, tstest/natlab/vnet: fix natlab test
 flake
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The natlab-integrationtest CI job frequently flakes by exhausting its
3m go test timeout. The root cause is that the QEMU VMs run under
pure software emulation (TCG) with no KVM. Under TCG, the guest
kernel's timer calibration busy-loops are at the mercy of host CPU
scheduling. When two VMs boot simultaneously on a 2-core CI runner,
one VM's calibration gets starved and produces wrong results, leaving
the kernel with broken timers that prevent it from ever completing
boot — even after the other VM finishes and frees up CPU.

Additionally, the microvm machine type doesn't provide HPET hardware,
but the kernel command line specified clocksource=hpet. And the VM
image build (make natlab) ran inside the test itself, consuming most
of the 3m timeout budget before the actual test started.

Fix by:

 - Enabling KVM when /dev/kvm is available, so timer calibration
   uses real hardware timers unaffected by host CPU scheduling.

 - Adding a CI step to set /dev/kvm permissions on the GitHub
   Actions runner (ubuntu-latest provides KVM but needs a udev rule).

 - Pre-building the VM image in a separate CI step so it doesn't
   cut into the go test -timeout budget.

 - Replacing the hardcoded 60s context timeout with one derived from
   t.Deadline(), so the test uses the full -timeout budget.

 - Adding VM boot progress detection (AwaitFirstPacket) and QMP
   diagnostics, so boot failures produce clear errors instead of
   opaque "context deadline exceeded" messages.

With KVM enabled, the test passes reliably even on a single CPU core
with 3 parallel workers — a scenario that was 100% broken under TCG.

Fixes #18906

Change-Id: I4c87631a9c9678d185b9f30cb05c0f7bfa9f5c62
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
---
 .github/workflows/natlab-integrationtest.yml |  11 ++
 tstest/integration/nat/nat_test.go           | 107 +++++++++++++++++--
 tstest/natlab/vnet/vnet.go                   |  28 +++++
 3 files changed, 138 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/natlab-integrationtest.yml b/.github/workflows/natlab-integrationtest.yml
index 162153cb2..6c0d49575 100644
--- a/.github/workflows/natlab-integrationtest.yml
+++ b/.github/workflows/natlab-integrationtest.yml
@@ -22,12 +22,23 @@ jobs:
     steps:
       - name: Check out code
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - name: Enable KVM
+        run: |
+          echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules
+          sudo udevadm control --reload-rules
+          sudo udevadm trigger --name-match=kvm
       - name: Install qemu
         run: |
           sudo rm -f /var/lib/man-db/auto-update
           sudo apt-get -y update
           sudo apt-get -y remove man-db
           sudo apt-get install -y qemu-system-x86 qemu-utils
+      - name: Build VM image
+        # The test will build this if missing, but we do it explicitly
+        # to avoid cutting into the go test -timeout budget, and to
+        # fail earlier with a clearer error if the image build breaks.
+        run: |
+          make -C gokrazy natlab
       - name: Run natlab integration tests
         run: |
           ./tool/go test -v -run=^TestEasyEasy$ -timeout=3m -count=1 ./tstest/integration/nat  --run-vm-tests
diff --git a/tstest/integration/nat/nat_test.go b/tstest/integration/nat/nat_test.go
index 98116f173..926c9eff4 100644
--- a/tstest/integration/nat/nat_test.go
+++ b/tstest/integration/nat/nat_test.go
@@ -7,6 +7,7 @@ import (
 	"bytes"
 	"cmp"
 	"context"
+	"encoding/json"
 	"errors"
 	"flag"
 	"fmt"
@@ -17,6 +18,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -327,6 +329,15 @@ func (nt *natTest) setupTest(ctx context.Context, addNode ...addNodeFunc) (nodes
 		}
 	})
 
+	haveKVM := false
+	if runtime.GOOS == "linux" {
+		if f, err := os.OpenFile("/dev/kvm", os.O_RDWR, 0); err == nil {
+			f.Close()
+			haveKVM = true
+		}
+	}
+
+	qmpSocks := make([]string, len(nodes))
 	for i, node := range nodes {
 		disk := fmt.Sprintf("%s/node-%d.qcow2", nt.tempDir, i)
 		out, err := exec.Command("qemu-img", "create",
@@ -349,22 +360,28 @@ func (nt *natTest) setupTest(ctx context.Context, addNode ...addNodeFunc) (nodes
 		}
 		envStr := envBuf.String()
 
-		cmd := exec.Command("qemu-system-x86_64",
+		qmpSocks[i] = fmt.Sprintf("%s/qmp-node-%d.sock", nt.tempDir, i)
+		qemuArgs := []string{
 			"-M", "microvm,isa-serial=off",
 			"-m", "384M",
 			"-nodefaults", "-no-user-config", "-nographic",
 			"-kernel", nt.kernel,
-			"-append", "console=hvc0 root=PARTUUID=60c24cc1-f3f9-427a-8199-76baa2d60001/PARTNROFF=1 ro init=/gokrazy/init panic=10 oops=panic pci=off nousb tsc=unstable clocksource=hpet gokrazy.remote_syslog.target="+sysLogAddr+" tailscale-tta=1"+envStr,
-			"-drive", "id=blk0,file="+disk+",format=qcow2",
+			"-append", "console=hvc0 root=PARTUUID=60c24cc1-f3f9-427a-8199-76baa2d60001/PARTNROFF=1 ro init=/gokrazy/init panic=10 oops=panic pci=off nousb gokrazy.remote_syslog.target=" + sysLogAddr + " tailscale-tta=1" + envStr,
+			"-drive", "id=blk0,file=" + disk + ",format=qcow2",
 			"-device", "virtio-blk-device,drive=blk0",
-			"-netdev", "stream,id=net0,addr.type=unix,addr.path="+sockAddr,
+			"-netdev", "stream,id=net0,addr.type=unix,addr.path=" + sockAddr,
 			"-device", "virtio-serial-device",
 			"-device", "virtio-rng-device",
-			"-device", "virtio-net-device,netdev=net0,mac="+node.MAC().String(),
+			"-device", "virtio-net-device,netdev=net0,mac=" + node.MAC().String(),
 			"-chardev", "stdio,id=virtiocon0,mux=on",
 			"-device", "virtconsole,chardev=virtiocon0",
 			"-mon", "chardev=virtiocon0,mode=readline",
-		)
+			"-qmp", "unix:" + qmpSocks[i] + ",server=on,wait=off",
+		}
+		if haveKVM {
+			qemuArgs = append(qemuArgs, "-enable-kvm", "-cpu", "host")
+		}
+		cmd := exec.Command("qemu-system-x86_64", qemuArgs...)
 		cmd.Stdout = os.Stdout
 		cmd.Stderr = os.Stderr
 		if err := cmd.Start(); err != nil {
@@ -376,6 +393,15 @@ func (nt *natTest) setupTest(ctx context.Context, addNode ...addNodeFunc) (nodes
 		})
 	}
 
+	for i, node := range nodes {
+		if err := nt.vnet.AwaitFirstPacket(ctx, node.MAC()); err != nil {
+			t.Logf("node %v: no boot progress (no packets received): %v", node, err)
+			t.Logf("node %v: QMP status: %s", node, qmpQueryStatus(qmpSocks[i]))
+			t.FailNow()
+		}
+		t.Logf("node %v: boot detected (first packet received)", node)
+	}
+
 	for _, n := range nodes {
 		client := nt.vnet.NodeAgentClient(n)
 		n.SetClient(client)
@@ -430,8 +456,24 @@ func (nt *natTest) setupTest(ctx context.Context, addNode ...addNodeFunc) (nodes
 	return nodes, clients, nt.vnet.Close
 }
 
+type hasDeadline interface {
+	Deadline() (deadline time.Time, ok bool)
+}
+
+// testContext returns a context derived from the test's deadline (from -timeout),
+// leaving a small margin for cleanup. Falls back to 60s if no deadline is set.
+func testContext(tb testing.TB) (context.Context, context.CancelFunc) {
+	if t, ok := tb.(hasDeadline); ok {
+		if dl, ok := t.Deadline(); ok {
+			const margin = 5 * time.Second
+			return context.WithDeadline(context.Background(), dl.Add(-margin))
+		}
+	}
+	return context.WithTimeout(context.Background(), 60*time.Second)
+}
+
 func (nt *natTest) runHostConnectivityTest(addNode ...addNodeFunc) bool {
-	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	ctx, cancel := testContext(nt.tb)
 	defer cancel()
 	nodes, clients, cleanup := nt.setupTest(ctx, addNode...)
 	defer cleanup()
@@ -457,7 +499,7 @@ func (nt *natTest) runHostConnectivityTest(addNode ...addNodeFunc) bool {
 }
 
 func (nt *natTest) runTailscaleConnectivityTest(addNode ...addNodeFunc) pingRoute {
-	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	ctx, cancel := testContext(nt.tb)
 	defer cancel()
 
 	nodes, clients, cleanup := nt.setupTest(ctx, addNode...)
@@ -580,6 +622,55 @@ func ping(ctx context.Context, t testing.TB, c *vnet.NodeAgentClient, target net
 	return nil, fmt.Errorf("no ping response (ctx: %v)", ctx.Err())
 }
 
+// qmpQueryStatus connects to a QEMU QMP socket and returns the VM status
+// (e.g. "running", "paused", "prelaunch") or an error string.
+func qmpQueryStatus(sockPath string) string {
+	conn, err := net.DialTimeout("unix", sockPath, 2*time.Second)
+	if err != nil {
+		return fmt.Sprintf("dial error: %v", err)
+	}
+	defer conn.Close()
+	conn.SetDeadline(time.Now().Add(5 * time.Second))
+	dec := json.NewDecoder(conn)
+
+	// Read QMP greeting.
+	var greeting json.RawMessage
+	if err := dec.Decode(&greeting); err != nil {
+		return fmt.Sprintf("greeting error: %v", err)
+	}
+
+	// Enter command mode.
+	if _, err := conn.Write([]byte(`{"execute":"qmp_capabilities"}` + "\n")); err != nil {
+		return fmt.Sprintf("write caps: %v", err)
+	}
+	var capsResp json.RawMessage
+	if err := dec.Decode(&capsResp); err != nil {
+		return fmt.Sprintf("caps response: %v", err)
+	}
+
+	// Query status.
+	if _, err := conn.Write([]byte(`{"execute":"query-status"}` + "\n")); err != nil {
+		return fmt.Sprintf("write query-status: %v", err)
+	}
+	var statusResp struct {
+		Return struct {
+			Running bool   `json:"running"`
+			Status  string `json:"status"`
+		} `json:"return"`
+		Error *struct {
+			Class string `json:"class"`
+			Desc  string `json:"desc"`
+		} `json:"error"`
+	}
+	if err := dec.Decode(&statusResp); err != nil {
+		return fmt.Sprintf("status response: %v", err)
+	}
+	if statusResp.Error != nil {
+		return fmt.Sprintf("qmp error: %s: %s", statusResp.Error.Class, statusResp.Error.Desc)
+	}
+	return fmt.Sprintf("status=%s running=%v", statusResp.Return.Status, statusResp.Return.Running)
+}
+
 func up(ctx context.Context, c *vnet.NodeAgentClient) error {
 	req, err := http.NewRequestWithContext(ctx, "GET", "http://unused/up", nil)
 	if err != nil {
diff --git a/tstest/natlab/vnet/vnet.go b/tstest/natlab/vnet/vnet.go
index 43256dafe..f7ecdb828 100644
--- a/tstest/natlab/vnet/vnet.go
+++ b/tstest/natlab/vnet/vnet.go
@@ -747,6 +747,7 @@ type Server struct {
 	agentConnWaiter map[*node]chan<- struct{} // signaled after added to set
 	agentConns      set.Set[*agentConn]       //  not keyed by node; should be small/cheap enough to scan all
 	agentDialer     map[*node]netx.DialFunc
+	gotFirstPacket  map[MAC]chan struct{} // closed on first packet from each MAC
 
 	cloudInitData map[int]*CloudInitData // node num → cloud-init config
 	fileContents  map[string][]byte      // filename → file bytes
@@ -825,6 +826,10 @@ func New(c *Config) (*Server, error) {
 	if err := s.initFromConfig(c); err != nil {
 		return nil, err
 	}
+	s.gotFirstPacket = make(map[MAC]chan struct{})
+	for mac := range s.nodeByMAC {
+		s.gotFirstPacket[mac] = make(chan struct{})
+	}
 	for n := range s.networks {
 		if err := n.initStack(); err != nil {
 			return nil, fmt.Errorf("newServer: initStack: %v", err)
@@ -932,6 +937,22 @@ func (s *Server) Close() {
 	s.wg.Wait()
 }
 
+// AwaitFirstPacket waits until the first ethernet frame is received from the
+// given MAC address, indicating the VM has booted far enough to send network
+// traffic. It returns an error if the context expires first.
+func (s *Server) AwaitFirstPacket(ctx context.Context, mac MAC) error {
+	ch, ok := s.gotFirstPacket[mac]
+	if !ok {
+		return fmt.Errorf("unknown MAC %v", mac)
+	}
+	select {
+	case <-ch:
+		return nil
+	case <-ctx.Done():
+		return fmt.Errorf("no network packets received from %v: %w", mac, ctx.Err())
+	}
+}
+
 // MACs returns the MAC addresses of the configured nodes.
 func (s *Server) MACs() iter.Seq[MAC] {
 	return maps.Keys(s.nodeByMAC)
@@ -1088,6 +1109,13 @@ func (s *Server) ServeUnixConn(uc *net.UnixConn, proto Protocol) {
 		}
 		if !didReg[srcMAC] {
 			didReg[srcMAC] = true
+			if ch, ok := s.gotFirstPacket[srcMAC]; ok {
+				select {
+				case <-ch: // already closed
+				default:
+					close(ch)
+				}
+			}
 			srcNet := srcNode.netForMAC(srcMAC)
 			if srcNet == nil {
 				s.logf("[conn %p] node %v has no network for MAC %v", c.uc, srcNode, srcMAC)