From 2802a01b8199a10c41241eb2699ab158be276df7 Mon Sep 17 00:00:00 2001 From: Christine Dodrill Date: Mon, 31 May 2021 17:04:49 -0400 Subject: [PATCH] tstest/integration/vms: test vms as they are ready (#2022) Instead of testing all the VMs at once when they are all ready, this patch changes the testing logic so that the vms are tested as soon as they register with testcontrol. Also limit the amount of VM ram used at once with the `-ram-limit` flag. That uses a semaphore to guard resource use. Also document CentOS' sins. Updates #1988 Signed-off-by: Christine Dodrill --- tstest/integration/vms/vms_test.go | 217 ++++++++++++++--------------- 1 file changed, 105 insertions(+), 112 deletions(-) diff --git a/tstest/integration/vms/vms_test.go b/tstest/integration/vms/vms_test.go index 25c612879..236b258e1 100644 --- a/tstest/integration/vms/vms_test.go +++ b/tstest/integration/vms/vms_test.go @@ -7,6 +7,7 @@ package vms import ( + "context" "crypto/sha256" "encoding/hex" "flag" @@ -31,6 +32,7 @@ import ( expect "github.com/google/goexpect" "github.com/pkg/sftp" "golang.org/x/crypto/ssh" + "golang.org/x/sync/semaphore" "inet.af/netaddr" "tailscale.com/net/interfaces" "tailscale.com/tstest" @@ -40,7 +42,8 @@ import ( const securePassword = "hunter2" -var runVMTests = flag.Bool("run-vm-tests", false, "if set, run expensive (10G+ ram) VM based integration tests") +var runVMTests = flag.Bool("run-vm-tests", false, "if set, run expensive VM based integration tests") +var vmRamLimit = flag.Int("ram-limit", 4096, "the maximum number of megabytes of ram that can be used for VMs, must be greater than or equal to 1024") var distroRex *regexValue = func() *regexValue { result := &regexValue{r: regexp.MustCompile(`.*`)} flag.Var(result, "distro-regex", "The regex that matches what distros should be run") @@ -259,9 +262,8 @@ func mkVM(t *testing.T, n int, d Distro, sshKey, hostURL, tdir string) func() { if err != nil { t.Fatalf("can't find cache dir: %v", err) } - cdir = 
filepath.Join(cdir, "within", "mkvm") + cdir = filepath.Join(cdir, "tailscale", "vm-test") os.MkdirAll(filepath.Join(cdir, "qcow2"), 0755) - os.MkdirAll(filepath.Join(cdir, "seed"), 0755) port := 23100 + n @@ -280,6 +282,7 @@ func mkVM(t *testing.T, n int, d Distro, sshKey, hostURL, tdir string) func() { "-drive", driveArg, "-cdrom", filepath.Join(tdir, d.name, "seed", "seed.iso"), "-vnc", fmt.Sprintf(":%d", n), + "-smbios", "type=1,serial=ds=nocloud;h=" + d.name, } t.Logf("running: qemu-system-x86_64 %s", strings.Join(args, " ")) @@ -378,7 +381,7 @@ func TestVMIntegrationEndToEnd(t *testing.T) { var ( ipMu sync.Mutex - ipMap = []ipMapping{} + ipMap = map[string]ipMapping{} ) mux := http.NewServeMux() @@ -398,7 +401,8 @@ func TestVMIntegrationEndToEnd(t *testing.T) { if err != nil { log.Panicf("bad port: %v", port) } - ipMap = append(ipMap, ipMapping{r.UserAgent(), port, host}) + distro := r.UserAgent() + ipMap[distro] = ipMapping{distro, port, host} t.Logf("%s: %v", name, host) }) @@ -424,140 +428,129 @@ func TestVMIntegrationEndToEnd(t *testing.T) { loginServer := fmt.Sprintf("http://%s", ln.Addr()) t.Logf("loginServer: %s", loginServer) - var numDistros = 0 + tstest.FixLogs(t) + defer tstest.UnfixLogs(t) - cancels := make(chan func(), len(distros)) + ramsem := semaphore.NewWeighted(int64(*vmRamLimit)) - t.Run("mkvm", func(t *testing.T) { + t.Run("do", func(t *testing.T) { for n, distro := range distros { n, distro := n, distro if rex.MatchString(distro.name) { t.Logf("%s matches %s", distro.name, rex) - numDistros++ } else { continue } t.Run(distro.name, func(t *testing.T) { + ctx, done := context.WithCancel(context.Background()) + defer done() + + if distro.name == "opensuse-leap-15-1" { + t.Skip("OpenSUSE Leap 15.1's cloud-init image just doesn't work for some reason, see https://github.com/tailscale/tailscale/issues/1988") + } + t.Parallel() + err := ramsem.Acquire(ctx, int64(distro.mem)) + if err != nil { + t.Fatalf("can't acquire ram semaphore: %v", err) + 
} + defer ramsem.Release(int64(distro.mem)) + cancel := mkVM(t, n, distro, string(pubkey), loginServer, dir) - cancels <- cancel + defer cancel() + var ipm ipMapping + + t.Run("wait-for-start", func(t *testing.T) { + waiter := time.NewTicker(time.Second) + defer waiter.Stop() + var ok bool + for { + <-waiter.C + ipMu.Lock() + if ipm, ok = ipMap[distro.name]; ok { + ipMu.Unlock() + break + } + ipMu.Unlock() + } + }) + + testDistro(t, loginServer, signer, ipm) }) } }) +} - close(cancels) - for cancel := range cancels { - //lint:ignore SA9001 They do actually get ran - defer cancel() - - if len(cancels) == 0 { - t.Log("all VMs started") +func testDistro(t *testing.T, loginServer string, signer ssh.Signer, ipm ipMapping) { + t.Helper() + port := ipm.port + hostport := fmt.Sprintf("127.0.0.1:%d", port) + ccfg := &ssh.ClientConfig{ + User: "root", + Auth: []ssh.AuthMethod{ssh.PublicKeys(signer), ssh.Password(securePassword)}, + HostKeyCallback: ssh.InsecureIgnoreHostKey(), + } + + // NOTE(Xe): This deadline loop helps to make things a bit faster, centos + // sometimes is slow at starting its sshd and will sometimes randomly kill + // SSH sessions on transition to multi-user.target. I don't know why they + // don't use socket activation. 
+ const maxRetries = 5 + var working bool + for i := 0; i < maxRetries; i++ { + cli, err := ssh.Dial("tcp", hostport, ccfg) + if err == nil { + working = true + cli.Close() break } - } - t.Run("wait-for-vms", func(t *testing.T) { - t.Log("waiting for VMs to register") - waiter := time.NewTicker(time.Second) - defer waiter.Stop() - n := 0 - for { - <-waiter.C - ipMu.Lock() - if len(ipMap) == numDistros { - ipMu.Unlock() - break - } else { - if n%30 == 0 { - t.Logf("ipMap: %d", len(ipMap)) - t.Logf("distros: %d", numDistros) - } - } - n++ - ipMu.Unlock() - } - }) - - ipMu.Lock() - defer ipMu.Unlock() - t.Run("join-net", func(t *testing.T) { - for _, ipm := range ipMap { - ipm := ipm - port := ipm.port - t.Run(ipm.name, func(t *testing.T) { - tstest.FixLogs(t) - t.Parallel() - - hostport := fmt.Sprintf("127.0.0.1:%d", port) - - // NOTE(Xe): This retry loop helps to make things a bit faster, centos sometimes is slow at starting its sshd. I don't know why they don't use socket activation. - const maxRetries = 5 - var working bool - for i := 0; i < maxRetries; i++ { - conn, err := net.Dial("tcp", hostport) - if err == nil { - working = true - conn.Close() - break - } - - time.Sleep(5 * time.Second) - } - - if !working { - t.Fatalf("can't connect to %s, tried %d times", hostport, maxRetries) - } + time.Sleep(10 * time.Second) + } - t.Logf("about to ssh into 127.0.0.1:%d", port) - cli, err := ssh.Dial("tcp", hostport, &ssh.ClientConfig{ - User: "root", - Auth: []ssh.AuthMethod{ssh.PublicKeys(signer), ssh.Password(securePassword)}, - HostKeyCallback: ssh.InsecureIgnoreHostKey(), - }) - if err != nil { - t.Fatal(err) - } - copyBinaries(t, cli) + if !working { + t.Fatalf("can't connect to %s, tried %d times", hostport, maxRetries) + } - timeout := 5 * time.Minute + t.Logf("about to ssh into 127.0.0.1:%d", port) + cli, err := ssh.Dial("tcp", hostport, ccfg) + if err != nil { + t.Fatal(err) + } + copyBinaries(t, cli) - e, _, err := expect.SpawnSSH(cli, timeout, 
expect.Verbose(true), expect.VerboseWriter(log.Writer())) - if err != nil { - t.Fatalf("%d: can't register a shell session: %v", port, err) - } - defer e.Close() + timeout := 5 * time.Minute - t.Log("opened session") + e, _, err := expect.SpawnSSH(cli, timeout, expect.Verbose(true), expect.VerboseWriter(log.Writer())) + if err != nil { + t.Fatalf("%d: can't register a shell session: %v", port, err) + } + defer e.Close() - _, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout) - if err != nil { - t.Fatalf("%d: can't get a shell: %v", port, err) - } - t.Logf("got shell for %d", port) - err = e.Send("systemctl start tailscaled.service\n") - if err != nil { - t.Fatalf("can't send command to start tailscaled: %v", err) - } - _, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout) - if err != nil { - t.Fatalf("%d: can't get a shell: %v", port, err) - } - err = e.Send(fmt.Sprintf("sudo tailscale up --login-server %s\n", loginServer)) - if err != nil { - t.Fatalf("%d: can't send tailscale up command: %v", port, err) - } - _, _, err = e.Expect(regexp.MustCompile(`Success.`), timeout) - if err != nil { - t.Fatalf("not successful: %v", err) - } - }) - } - }) + t.Log("opened session") - if numNodes := cs.NumNodes(); numNodes != len(ipMap) { - t.Errorf("wanted %d nodes, got: %d", len(ipMap), numNodes) + _, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout) + if err != nil { + t.Fatalf("%d: can't get a shell: %v", port, err) + } + t.Logf("got shell for %d", port) + err = e.Send("systemctl start tailscaled.service\n") + if err != nil { + t.Fatalf("can't send command to start tailscaled: %v", err) + } + _, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout) + if err != nil { + t.Fatalf("%d: can't get a shell: %v", port, err) + } + err = e.Send(fmt.Sprintf("sudo tailscale up --login-server %s\n", loginServer)) + if err != nil { + t.Fatalf("%d: can't send tailscale up command: %v", port, err) + } + _, _, err = e.Expect(regexp.MustCompile(`Success.`), timeout) + if err != 
nil { + t.Fatalf("not successful: %v", err) } }