tstest/natlab, .github/workflows: add opt-in natlab CI workflow
The natlab vmtest suite (tstest/natlab/vmtest) and the integration nat
tests are gated behind --run-vm-tests because they need KVM and are
slow. Until now nothing in CI exercised them apart from a single
canary TestEasyEasy run on every PR.
Add .github/workflows/natlab-test.yml that runs the full opt-in suite
on demand (workflow_dispatch), on PRs labeled "natlab", and on main
every 12 hours via cron. The workflow has two phases:
- "prepare" builds the gokrazy VM image, downloads the Ubuntu and
FreeBSD cloud images once via the new natlabprep tool, and emits
a dynamic JSON matrix of every TestX function it finds in the two
opt-in packages.
- "test" is a per-test matrix that depends on prepare. Each matrix
job restores the shared caches and runs a single test, so adding
a new TestFoo is automatically picked up on the next run without
any workflow edits.
Rename the existing natlab-integrationtest.yml to natlab-basic.yml
since it's the small smoke variant (just TestEasyEasy on every PR);
the new natlab-test.yml is the bigger suite. The job inside is
renamed to EasyEasy for the same reason.
Move the macOS arm64 host check from vmtest.Env.Start into
vmtest.Env.AddNode so a test that adds a vmtest.MacOS node skips
immediately on a non-macOS host, and add an explicit
skipIfNotMacOSArm64 helper at the top of the two macOS-only tests
so the platform requirement is obvious to readers.
Quiet the takeAgentConnOne miss log in tstest/natlab/vnet by default
(it was the overwhelming majority of bytes in CI logs, with no signal
in healthy runs); it is now emitted only when the debug constant is
enabled. In its place, takeAgentConn logs a "still waiting" line once
a wait has exceeded 10s, and at most once every 10s after that, so a
truly stuck agent connection still surfaces.
Updates #13038
Change-Id: I4582098d8865200fd5a73a9b696942319ccf3bf0
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
committed by
Brad Fitzpatrick
parent
4eec4423b4
commit
e062b46984
@@ -2583,14 +2583,24 @@ func (s *Server) addIdleAgentConn(ac *agentConn) {
|
||||
|
||||
func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok bool) {
|
||||
const debug = false
|
||||
// stuckThreshold is how long we wait before deciding the agent is slow
|
||||
// enough to warrant a log line. Below this we stay quiet because, in
|
||||
// healthy runs with many agent dials in flight, even a few-millisecond
|
||||
// wait would otherwise log every poll for every concurrent waiter.
|
||||
const stuckThreshold = 10 * time.Second
|
||||
start := time.Now()
|
||||
var lastWarn time.Time
|
||||
for {
|
||||
ac, ok := s.takeAgentConnOne(n)
|
||||
if ok {
|
||||
ac, miss := s.takeAgentConnOne(n)
|
||||
if ac != nil {
|
||||
if debug {
|
||||
log.Printf("takeAgentConn: got agent conn for %v", n.mac)
|
||||
}
|
||||
return ac, true
|
||||
}
|
||||
if debug && miss > 0 {
|
||||
log.Printf("takeAgentConnOne: missed %d times for %v", miss, n.mac)
|
||||
}
|
||||
s.mu.Lock()
|
||||
ready := make(chan struct{})
|
||||
mak.Set(&s.agentConnWaiter, n, ready)
|
||||
@@ -2599,6 +2609,10 @@ func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok b
|
||||
if debug {
|
||||
log.Printf("takeAgentConn: waiting for agent conn for %v", n.mac)
|
||||
}
|
||||
if elapsed := time.Since(start); elapsed > stuckThreshold && time.Since(lastWarn) > stuckThreshold {
|
||||
log.Printf("takeAgentConn: still waiting for agent conn for %v after %v (%d idle conns for other nodes)", n.mac, elapsed.Round(time.Second), miss)
|
||||
lastWarn = time.Now()
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, false
|
||||
@@ -2611,21 +2625,21 @@ func (s *Server) takeAgentConn(ctx context.Context, n *node) (_ *agentConn, ok b
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) takeAgentConnOne(n *node) (_ *agentConn, ok bool) {
|
||||
// takeAgentConnOne returns an idle agent conn for n if one is available,
|
||||
// otherwise nil. miss is the number of idle agent conns for other nodes that
|
||||
// were walked over while looking; the caller may use it for diagnostics when
|
||||
// a wait drags on.
|
||||
func (s *Server) takeAgentConnOne(n *node) (ac *agentConn, miss int) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
miss := 0
|
||||
for ac := range s.agentConns {
|
||||
if ac.node == n {
|
||||
s.agentConns.Delete(ac)
|
||||
return ac, true
|
||||
return ac, 0
|
||||
}
|
||||
miss++
|
||||
}
|
||||
if miss > 0 {
|
||||
log.Printf("takeAgentConnOne: missed %d times for %v", miss, n.mac)
|
||||
}
|
||||
return nil, false
|
||||
return nil, miss
|
||||
}
|
||||
|
||||
type NodeAgentClient struct {
|
||||
|
||||
Reference in New Issue
Block a user