tstest/natlab/vmtest: add web UI for watching VM tests live

Add an optional --vmtest-web flag that starts an HTTP server showing a
live dashboard for vmtest runs. The dashboard includes:

- Step progress tracker showing all test phases (compile, image prep,
  QEMU launch, agent connect, tailscale up, test-specific steps)
  with status icons and elapsed times
- Per-VM "virtual monitor" cards showing serial console output
  streamed in realtime via WebSocket
- Per-NIC DHCP status (supporting multi-homed VMs like subnet routers)
- Per-node Tailscale status (hidden for non-tailnet VMs)
- Test status badge (Running/Passed/Failed) with live elapsed timer
- Event log showing all lifecycle events chronologically

Architecture follows the existing util/eventbus HTMX+WebSocket pattern:
the server pushes HTML fragments with hx-swap-oob attributes over a
WebSocket, and HTMX routes them to the correct DOM elements by ID.

Key components:
- vmstatus.go: Step tracker (Begin/End lifecycle), EventBus (pub/sub
  with history for late joiners), VMEvent types, NodeStatus tracking
- web.go: HTTP server, WebSocket handler, template loading, ANSI-to-HTML
  conversion via robert-nix/ansihtml, deterministic port selection
- assets/: HTML templates, CSS, HTMX library (copied from eventbus)
- vnet/vnet.go: DHCP event callback on Server for observing DHCP lifecycle
- qemu.go: Console log file tailing with manual offset-based reading

Usage:
  go test ./tstest/natlab/vmtest/ --run-vm-tests --vmtest-web=:0 -v

When using :0, a deterministic port based on the test name is tried
first so re-runs get the same URL, falling back to OS-assigned on
conflict.

Updates #13038

Change-Id: I45281347b3d7af78ed9f4ff896033984f84dcb4d
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick
2026-04-11 04:33:48 +00:00
committed by Brad Fitzpatrick
parent 0ac09721df
commit b9eac14ef9
14 changed files with 1322 additions and 39 deletions
+303 -13
View File
@@ -31,6 +31,7 @@ import (
"testing"
"time"
"github.com/google/gopacket/layers"
"golang.org/x/sync/errgroup"
"tailscale.com/client/local"
"tailscale.com/ipn"
@@ -67,6 +68,15 @@ type Env struct {
gokrazyKernel string // path to gokrazy kernel
qemuProcs []*exec.Cmd // launched QEMU processes
// Web UI support.
ctx context.Context // cancelled when test ends
eventBus *EventBus
testStatus *TestStatus
steps []*Step
nodeStatusMu sync.Mutex
nodeStatus map[string]*NodeStatus // keyed by node name
}
// logVerbosef logs a message only when --verbose-vm-debug is set.
@@ -77,6 +87,145 @@ func (e *Env) logVerbosef(format string, args ...any) {
}
}
// AddStep declares an expected stage of the test. The web UI shows all steps
// from the start, tracking their progress. Call before or during the test.
// Returns a *Step whose Begin/End methods drive the progress display.
func (e *Env) AddStep(name string) *Step {
s := &Step{
name: name,
index: len(e.steps),
env: e,
}
e.steps = append(e.steps, s)
return s
}
// Steps returns all declared steps in order.
func (e *Env) Steps() []*Step {
return e.steps
}
// publishStepChange publishes a step status change event.
func (e *Env) publishStepChange(s *Step) {
e.eventBus.Publish(VMEvent{
Type: EventStepChanged,
Message: fmt.Sprintf("%s %s", s.Status().Icon(), s.name),
Step: s,
})
}
// initNodeStatus initializes the NodeStatus for all nodes. Called after
// AddNode but before Start so the web UI can render them.
func (e *Env) initNodeStatus() {
e.nodeStatusMu.Lock()
defer e.nodeStatusMu.Unlock()
for _, n := range e.nodes {
nics := make([]NICStatus, len(n.nets))
for i := range n.nets {
nics[i] = NICStatus{
NetName: e.nicLabel(n, i),
DHCP: "waiting",
}
}
e.nodeStatus[n.name] = &NodeStatus{
Name: n.name,
OS: n.os.Name,
NICs: nics,
JoinsTailnet: n.joinTailnet,
Tailscale: "--",
}
}
}
// nicLabel returns a short human-readable label for a node's i-th NIC.
// After Start(), we can use the assigned LAN IP. Before that, we use "NIC N".
func (e *Env) nicLabel(n *Node, i int) string {
if n.vnetNode != nil {
ip := n.vnetNode.LanIP(n.nets[i])
if ip.IsValid() {
return ip.String()
}
}
return fmt.Sprintf("NIC %d", i)
}
// getNodeStatus returns the current status for a node.
func (e *Env) getNodeStatus(name string) NodeStatus {
e.nodeStatusMu.Lock()
defer e.nodeStatusMu.Unlock()
ns := e.nodeStatus[name]
if ns == nil {
return NodeStatus{Name: name, Tailscale: "--"}
}
return *ns
}
// setNodeDHCP updates the DHCP status for a specific NIC on a node.
func (e *Env) setNodeDHCP(name string, nicIdx int, status string) {
e.nodeStatusMu.Lock()
ns := e.nodeStatus[name]
if ns != nil && nicIdx < len(ns.NICs) {
ns.NICs[nicIdx].DHCP = status
}
e.nodeStatusMu.Unlock()
}
// setNodeTailscale updates the Tailscale status for a node and publishes
// an event so the web UI updates via WebSocket.
func (e *Env) setNodeTailscale(name, status string) {
e.nodeStatusMu.Lock()
ns := e.nodeStatus[name]
if ns != nil {
ns.Tailscale = status
}
e.nodeStatusMu.Unlock()
e.eventBus.Publish(VMEvent{
NodeName: name,
Type: EventTailscale,
Message: "Tailscale: " + status,
Detail: status,
})
}
// appendConsoleLine adds a line to a node's console buffer.
func (e *Env) appendConsoleLine(name, line string) {
e.nodeStatusMu.Lock()
ns := e.nodeStatus[name]
if ns != nil {
ns.Console = append(ns.Console, line)
if len(ns.Console) > maxConsoleLines {
ns.Console = ns.Console[len(ns.Console)-maxConsoleLines:]
}
}
e.nodeStatusMu.Unlock()
}
// nicIndexForMAC returns the NIC index (0-based) for a given MAC on a node.
// Returns -1 if not found.
func (e *Env) nicIndexForMAC(name string, mac vnet.MAC) int {
for _, n := range e.nodes {
if n.name != name {
continue
}
for i := range n.nets {
if n.vnetNode.NICMac(i) == mac {
return i
}
}
}
return -1
}
// nodeNameByNum returns the node name for a given vnet node number.
func (e *Env) nodeNameByNum(num int) string {
for _, n := range e.nodes {
if n.num == num {
return n.name
}
}
return fmt.Sprintf("node%d", num)
}
// New creates a new test environment. It skips the test if --run-vm-tests is not set.
func New(t testing.TB) *Env {
if !*runVMTests {
@@ -84,11 +233,23 @@ func New(t testing.TB) *Env {
}
tempDir := t.TempDir()
return &Env{
t: t,
tempDir: tempDir,
binDir: filepath.Join(tempDir, "bin"),
e := &Env{
t: t,
tempDir: tempDir,
binDir: filepath.Join(tempDir, "bin"),
eventBus: newEventBus(),
testStatus: newTestStatus(),
nodeStatus: make(map[string]*NodeStatus),
}
t.Cleanup(func() {
e.testStatus.finish(t.Failed())
e.eventBus.Publish(VMEvent{
Type: EventTestStatus,
Message: e.testStatus.State(),
Detail: formatDuration(e.testStatus.Elapsed()),
})
})
return e
}
// AddNetwork creates a new virtual network. Arguments follow the same pattern as
@@ -197,6 +358,11 @@ func (e *Env) Start() {
t := e.t
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
t.Cleanup(cancel)
e.ctx = ctx
// Initialize node status and start web UI as early as possible.
e.initNodeStatus()
e.maybeStartWebServer()
if err := os.MkdirAll(e.binDir, 0755); err != nil {
t.Fatal(err)
@@ -223,27 +389,94 @@ func (e *Env) Start() {
}
}
// Compile binaries and download/build images in parallel.
// Any failure cancels the others via the errgroup context.
eg, egCtx := errgroup.WithContext(ctx)
// Declare framework steps for the web UI.
// User-declared steps (from AddStep before Start) get moved to the end
// so framework steps (compile, image, QEMU, etc.) come first.
userSteps := e.steps
e.steps = nil
compileSteps := map[platform]*Step{}
for _, p := range needPlatform.Slice() {
eg.Go(func() error {
return e.compileBinariesForOS(egCtx, p.goos, p.goarch)
})
compileSteps[p] = e.AddStep(fmt.Sprintf("Compile %s_%s binaries", p.goos, p.goarch))
}
didOS := set.Set[string]{} // dedup by image name
imageSteps := map[string]*Step{} // keyed by OS name
didOS := set.Set[string]{} // dedup by image name
for _, n := range e.nodes {
if didOS.Contains(n.os.Name) {
continue
}
didOS.Add(n.os.Name)
if n.os.IsGokrazy {
imageSteps["gokrazy"] = e.AddStep("Build gokrazy image")
} else {
imageSteps[n.os.Name] = e.AddStep(fmt.Sprintf("Prepare %s image", n.os.Name))
}
}
vnetStep := e.AddStep("Create virtual network")
qemuSteps := map[string]*Step{}
agentSteps := map[string]*Step{}
tsUpSteps := map[string]*Step{}
for _, n := range e.nodes {
qemuSteps[n.name] = e.AddStep(fmt.Sprintf("Launch QEMU: %s", n.name))
agentSteps[n.name] = e.AddStep(fmt.Sprintf("Wait for agent: %s", n.name))
if n.joinTailnet {
tsUpSteps[n.name] = e.AddStep(fmt.Sprintf("Tailscale up: %s", n.name))
}
}
// Re-append user-declared steps after all framework steps.
for _, s := range userSteps {
s.index = len(e.steps)
e.steps = append(e.steps, s)
}
// Compile binaries and download/build images in parallel.
// Any failure cancels the others via the errgroup context.
eg, egCtx := errgroup.WithContext(ctx)
for _, p := range needPlatform.Slice() {
step := compileSteps[p]
eg.Go(func() error {
step.Begin()
err := e.compileBinariesForOS(egCtx, p.goos, p.goarch)
if err != nil {
step.End(err)
return err
}
step.End(nil)
return nil
})
}
didOS = set.Set[string]{} // reset for second pass
for _, n := range e.nodes {
if didOS.Contains(n.os.Name) {
continue
}
didOS.Add(n.os.Name)
if n.os.IsGokrazy {
step := imageSteps["gokrazy"]
eg.Go(func() error {
return e.ensureGokrazy(egCtx)
step.Begin()
err := e.ensureGokrazy(egCtx)
if err != nil {
step.End(err)
return err
}
step.End(nil)
return nil
})
} else {
step := imageSteps[n.os.Name]
osImg := n.os
eg.Go(func() error {
return ensureImage(egCtx, n.os)
step.Begin()
err := ensureImage(egCtx, osImg)
if err != nil {
step.End(err)
return err
}
step.End(nil)
return nil
})
}
}
@@ -252,6 +485,7 @@ func (e *Env) Start() {
}
// Create the vnet server.
vnetStep.Begin()
var err error
e.server, err = vnet.New(&e.cfg)
if err != nil {
@@ -259,6 +493,50 @@ func (e *Env) Start() {
}
t.Cleanup(func() { e.server.Close() })
// Register DHCP event callback for the web UI.
e.server.SetDHCPCallback(func(mac vnet.MAC, nodeNum int, msgType layers.DHCPMsgType, ip netip.Addr) {
name := e.nodeNameByNum(nodeNum)
nicIdx := e.nicIndexForMAC(name, mac)
ipStr := ip.String()
switch msgType {
case layers.DHCPMsgTypeDiscover:
e.setNodeDHCP(name, nicIdx, "Discover sent")
e.eventBus.Publish(VMEvent{
NodeName: name,
Type: EventDHCPDiscover,
Message: "DHCP Discover sent",
NIC: nicIdx,
})
case layers.DHCPMsgTypeOffer:
e.setNodeDHCP(name, nicIdx, "Offered "+ipStr)
e.eventBus.Publish(VMEvent{
NodeName: name,
Type: EventDHCPOffer,
Message: "DHCP Offer received",
Detail: ipStr,
NIC: nicIdx,
})
case layers.DHCPMsgTypeRequest:
e.setNodeDHCP(name, nicIdx, "Requesting "+ipStr)
e.eventBus.Publish(VMEvent{
NodeName: name,
Type: EventDHCPRequest,
Message: "DHCP Request sent",
Detail: ipStr,
NIC: nicIdx,
})
case layers.DHCPMsgTypeAck:
e.setNodeDHCP(name, nicIdx, "Got "+ipStr)
e.eventBus.Publish(VMEvent{
NodeName: name,
Type: EventDHCPAck,
Message: "DHCP Ack: got " + ipStr,
Detail: ipStr,
NIC: nicIdx,
})
}
})
// Register compiled binaries with the file server VIP.
// Binaries are registered at <goos>_<goarch>/<name> (e.g. "linux_amd64/tta").
for _, p := range needPlatform.Slice() {
@@ -271,6 +549,7 @@ func (e *Env) Start() {
e.server.RegisterFile(dir+"/"+name, data)
}
}
vnetStep.End(nil)
// Cloud-init config is delivered via local seed ISOs (created in startCloudQEMU),
// not via the cloud-init HTTP VIP, because network-config must be available
@@ -296,9 +575,12 @@ func (e *Env) Start() {
// Launch QEMU processes.
for _, n := range e.nodes {
step := qemuSteps[n.name]
step.Begin()
if err := e.startQEMU(n); err != nil {
t.Fatalf("startQEMU(%s): %v", n.name, err)
}
step.End(nil)
}
// Set up agent clients and wait for all agents to connect.
@@ -311,12 +593,15 @@ func (e *Env) Start() {
var agentEg errgroup.Group
for _, n := range e.nodes {
agentEg.Go(func() error {
aStep := agentSteps[n.name]
aStep.Begin()
t.Logf("[%s] waiting for agent...", n.name)
st, err := n.agent.Status(ctx)
if err != nil {
return fmt.Errorf("[%s] agent status: %w", n.name, err)
}
t.Logf("[%s] agent connected, backend state: %s", n.name, st.BackendState)
aStep.End(nil)
if n.vnetNode.HostFirewall() {
if err := n.agent.EnableHostFirewall(ctx); err != nil {
@@ -325,6 +610,8 @@ func (e *Env) Start() {
}
if n.joinTailnet {
tsStep := tsUpSteps[n.name]
tsStep.Begin()
if err := e.tailscaleUp(ctx, n); err != nil {
return fmt.Errorf("[%s] tailscale up: %w", n.name, err)
}
@@ -335,7 +622,10 @@ func (e *Env) Start() {
if st.BackendState != "Running" {
return fmt.Errorf("[%s] state = %q, want Running", n.name, st.BackendState)
}
ips := fmt.Sprintf("%v", st.Self.TailscaleIPs)
e.setNodeTailscale(n.name, "Running "+ips)
t.Logf("[%s] up with %v", n.name, st.Self.TailscaleIPs)
tsStep.End(nil)
}
return nil