7dcb378875
The natlab-integrationtest CI job frequently flakes by exhausting its 3m go test timeout. The root cause is that the QEMU VMs run under pure software emulation (TCG) with no KVM. Under TCG, the guest kernel's timer calibration busy-loops are at the mercy of host CPU scheduling. When two VMs boot simultaneously on a 2-core CI runner, one VM's calibration gets starved and produces wrong results, leaving the kernel with broken timers that prevent it from ever completing boot — even after the other VM finishes and frees up CPU. Additionally, the microvm machine type doesn't provide HPET hardware, but the kernel command line specified clocksource=hpet. And the VM image build (make natlab) ran inside the test itself, consuming most of the 3m timeout budget before the actual test started. Fix by: - Enabling KVM when /dev/kvm is available, so timer calibration uses real hardware timers unaffected by host CPU scheduling. - Adding a CI step to set /dev/kvm permissions on the GitHub Actions runner (ubuntu-latest provides KVM but needs a udev rule). - Pre-building the VM image in a separate CI step so it doesn't cut into the go test -timeout budget. - Replacing the hardcoded 60s context timeout with one derived from t.Deadline(), so the test uses the full -timeout budget. - Adding VM boot progress detection (AwaitFirstPacket) and QMP diagnostics, so boot failures produce clear errors instead of opaque "context deadline exceeded" messages. With KVM enabled, the test passes reliably even on a single CPU core with 3 parallel workers — a scenario that was 100% broken under TCG. Fixes #18906 Change-Id: I4c87631a9c9678d185b9f30cb05c0f7bfa9f5c62 Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
1003 lines
26 KiB
Go
1003 lines
26 KiB
Go
// Copyright (c) Tailscale Inc & contributors
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
package nat
|
|
|
|
import (
|
|
"bytes"
|
|
"cmp"
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"net"
|
|
"net/http"
|
|
"net/netip"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"runtime"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
|
|
"golang.org/x/mod/modfile"
|
|
"golang.org/x/sync/errgroup"
|
|
"tailscale.com/client/tailscale"
|
|
"tailscale.com/envknob"
|
|
"tailscale.com/ipn/ipnstate"
|
|
"tailscale.com/syncs"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/tstest/natlab/vnet"
|
|
)
|
|
|
|
// Command-line flags that control how the tests in this file run.

// runVMTests must be set for any test built on newNatTest to run; without
// it those tests are skipped.
var runVMTests = flag.Bool("run-vm-tests", false, "run tests that require a VM")

// logTailscaled enables verbose syslog on every node created by setupTest.
var logTailscaled = flag.Bool("log-tailscaled", false, "log tailscaled output")

// pcapFile is passed to vnet.Config.SetPCAPFile by setupTest.
var pcapFile = flag.String("pcap", "", "write pcap to file")
|
|
|
|
type natTest struct {
|
|
tb testing.TB
|
|
base string // base image
|
|
tempDir string // for qcow2 images
|
|
vnet *vnet.Server
|
|
kernel string // linux kernel path
|
|
|
|
gotRoute pingRoute
|
|
}
|
|
|
|
func newNatTest(tb testing.TB) *natTest {
|
|
root, err := os.Getwd()
|
|
if err != nil {
|
|
tb.Fatal(err)
|
|
}
|
|
modRoot := filepath.Join(root, "../../..")
|
|
|
|
nt := &natTest{
|
|
tb: tb,
|
|
tempDir: tb.TempDir(),
|
|
base: filepath.Join(modRoot, "gokrazy/natlabapp.qcow2"),
|
|
}
|
|
|
|
if !*runVMTests {
|
|
tb.Skip("skipping heavy test; set --run-vm-tests to run")
|
|
}
|
|
|
|
if _, err := os.Stat(nt.base); err != nil {
|
|
if !os.IsNotExist(err) {
|
|
tb.Fatal(err)
|
|
}
|
|
tb.Logf("building VM image...")
|
|
cmd := exec.Command("make", "natlab")
|
|
cmd.Dir = filepath.Join(modRoot, "gokrazy")
|
|
cmd.Stderr = os.Stderr
|
|
cmd.Stdout = os.Stdout
|
|
if err := cmd.Run(); err != nil {
|
|
tb.Fatalf("Error running 'make natlab' in gokrazy directory: %v", err)
|
|
}
|
|
if _, err := os.Stat(nt.base); err != nil {
|
|
tb.Skipf("still can't find VM image: %v", err)
|
|
}
|
|
}
|
|
|
|
nt.kernel, err = findKernelPath(filepath.Join(modRoot, "go.mod"))
|
|
if err != nil {
|
|
tb.Skipf("skipping test; kernel not found: %v", err)
|
|
}
|
|
tb.Logf("found kernel: %v", nt.kernel)
|
|
|
|
return nt
|
|
}
|
|
|
|
func findKernelPath(goMod string) (string, error) {
|
|
b, err := os.ReadFile(goMod)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
mf, err := modfile.Parse("go.mod", b, nil)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
goModB, err := exec.Command("go", "env", "GOMODCACHE").CombinedOutput()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
for _, r := range mf.Require {
|
|
if r.Mod.Path == "github.com/tailscale/gokrazy-kernel" {
|
|
return strings.TrimSpace(string(goModB)) + "/" + r.Mod.String() + "/vmlinuz", nil
|
|
}
|
|
}
|
|
return "", fmt.Errorf("failed to find kernel in %v", goMod)
|
|
}
|
|
|
|
// addNodeFunc adds one node to c and returns it. A nil result means the
// requested combination is not applicable; setupTest then skips the test.
type addNodeFunc func(c *vnet.Config) *vnet.Node
|
|
|
|
// v6cidr returns the IPv6 CIDR string "2000:<n>::1/64", with n written in
// decimal.
func v6cidr(n int) string {
	return fmt.Sprint("2000:", n, "::1/64")
}
|
|
|
|
func easy(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("192.168.%d.1/24", n), vnet.EasyNAT))
|
|
}
|
|
|
|
func easyAnd6(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("192.168.%d.1/24", n),
|
|
v6cidr(n),
|
|
vnet.EasyNAT))
|
|
}
|
|
|
|
// easyNoControlDiscoRotate sets up a node with easy NAT, cuts traffic to
|
|
// control after connecting, and then rotates the disco key to simulate a newly
|
|
// started node (from a disco perspective).
|
|
func easyNoControlDiscoRotate(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
nw := c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("192.168.%d.1/24", n),
|
|
vnet.EasyNAT)
|
|
nw.SetPostConnectControlBlackhole(true)
|
|
return c.AddNode(
|
|
vnet.TailscaledEnv{
|
|
Key: "TS_USE_CACHED_NETMAP",
|
|
Value: "true",
|
|
},
|
|
vnet.RotateDisco, vnet.PreICMPPing, nw)
|
|
}
|
|
|
|
func v6AndBlackholedIPv4(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
nw := c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("192.168.%d.1/24", n),
|
|
v6cidr(n),
|
|
vnet.EasyNAT)
|
|
nw.SetBlackholedIPv4(true)
|
|
return c.AddNode(nw)
|
|
}
|
|
|
|
func just6(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(c.AddNetwork(v6cidr(n))) // public IPv6 prefix
|
|
}
|
|
|
|
// easy + host firewall
|
|
func easyFW(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(vnet.HostFirewall, c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("192.168.%d.1/24", n), vnet.EasyNAT))
|
|
}
|
|
|
|
func easyAF(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("192.168.%d.1/24", n), vnet.EasyAFNAT))
|
|
}
|
|
|
|
func sameLAN(c *vnet.Config) *vnet.Node {
|
|
nw := c.FirstNetwork()
|
|
if nw == nil {
|
|
return nil
|
|
}
|
|
if !nw.CanTakeMoreNodes() {
|
|
return nil
|
|
}
|
|
return c.AddNode(nw)
|
|
}
|
|
|
|
func one2one(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("172.16.%d.1/24", n), vnet.One2OneNAT))
|
|
}
|
|
|
|
func easyPMP(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("192.168.%d.1/24", n), vnet.EasyNAT, vnet.NATPMP))
|
|
}
|
|
|
|
// easy + port mapping + host firewall + BPF
|
|
func easyPMPFWPlusBPF(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(
|
|
vnet.HostFirewall,
|
|
vnet.TailscaledEnv{
|
|
Key: "TS_ENABLE_RAW_DISCO",
|
|
Value: "true",
|
|
},
|
|
vnet.TailscaledEnv{
|
|
Key: "TS_DEBUG_RAW_DISCO",
|
|
Value: "1",
|
|
},
|
|
vnet.TailscaledEnv{
|
|
Key: "TS_DEBUG_DISCO",
|
|
Value: "1",
|
|
},
|
|
vnet.TailscaledEnv{
|
|
Key: "TS_LOG_VERBOSITY",
|
|
Value: "2",
|
|
},
|
|
c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("192.168.%d.1/24", n), vnet.EasyNAT, vnet.NATPMP))
|
|
}
|
|
|
|
// easy + port mapping + host firewall - BPF
|
|
func easyPMPFWNoBPF(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(
|
|
vnet.HostFirewall,
|
|
vnet.TailscaledEnv{
|
|
Key: "TS_ENABLE_RAW_DISCO",
|
|
Value: "false",
|
|
},
|
|
c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("192.168.%d.1/24", n), vnet.EasyNAT, vnet.NATPMP))
|
|
}
|
|
|
|
func hard(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("10.0.%d.1/24", n), vnet.HardNAT))
|
|
}
|
|
|
|
func hardNoDERPOrEndoints(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("10.0.%d.1/24", n), vnet.HardNAT),
|
|
vnet.TailscaledEnv{
|
|
Key: "TS_DEBUG_STRIP_ENDPOINTS",
|
|
Value: "1",
|
|
},
|
|
vnet.TailscaledEnv{
|
|
Key: "TS_DEBUG_STRIP_HOME_DERP",
|
|
Value: "1",
|
|
},
|
|
)
|
|
}
|
|
|
|
func hardPMP(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(c.AddNetwork(
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
fmt.Sprintf("10.7.%d.1/24", n), vnet.HardNAT, vnet.NATPMP))
|
|
}
|
|
|
|
func (nt *natTest) setupTest(ctx context.Context, addNode ...addNodeFunc) (nodes []*vnet.Node, clients []*vnet.NodeAgentClient, cleanup func()) {
|
|
if len(addNode) < 1 || len(addNode) > 2 {
|
|
nt.tb.Fatalf("runTest: invalid number of nodes %v; want 1 or 2", len(addNode))
|
|
}
|
|
t := nt.tb
|
|
|
|
var c vnet.Config
|
|
c.SetPCAPFile(*pcapFile)
|
|
for _, fn := range addNode {
|
|
node := fn(&c)
|
|
if node == nil {
|
|
t.Skip("skipping test; not applicable combination")
|
|
}
|
|
nodes = append(nodes, node)
|
|
if *logTailscaled {
|
|
node.SetVerboseSyslog(true)
|
|
}
|
|
}
|
|
|
|
var err error
|
|
nt.vnet, err = vnet.New(&c)
|
|
if err != nil {
|
|
t.Fatalf("newServer: %v", err)
|
|
}
|
|
nt.tb.Cleanup(func() {
|
|
nt.vnet.Close()
|
|
})
|
|
|
|
var wg sync.WaitGroup // waiting for srv.Accept goroutine
|
|
defer wg.Wait()
|
|
|
|
sockAddr := filepath.Join(nt.tempDir, "qemu.sock")
|
|
srv, err := net.Listen("unix", sockAddr)
|
|
if err != nil {
|
|
t.Fatalf("Listen: %v", err)
|
|
}
|
|
defer srv.Close()
|
|
|
|
wg.Go(func() {
|
|
for {
|
|
c, err := srv.Accept()
|
|
if err != nil {
|
|
return
|
|
}
|
|
go nt.vnet.ServeUnixConn(c.(*net.UnixConn), vnet.ProtocolQEMU)
|
|
}
|
|
})
|
|
|
|
haveKVM := false
|
|
if runtime.GOOS == "linux" {
|
|
if f, err := os.OpenFile("/dev/kvm", os.O_RDWR, 0); err == nil {
|
|
f.Close()
|
|
haveKVM = true
|
|
}
|
|
}
|
|
|
|
qmpSocks := make([]string, len(nodes))
|
|
for i, node := range nodes {
|
|
disk := fmt.Sprintf("%s/node-%d.qcow2", nt.tempDir, i)
|
|
out, err := exec.Command("qemu-img", "create",
|
|
"-f", "qcow2",
|
|
"-F", "qcow2",
|
|
"-b", nt.base,
|
|
disk).CombinedOutput()
|
|
if err != nil {
|
|
t.Fatalf("qemu-img create: %v, %s", err, out)
|
|
}
|
|
|
|
var envBuf bytes.Buffer
|
|
for _, e := range node.Env() {
|
|
fmt.Fprintf(&envBuf, " tailscaled.env=%s=%s", e.Key, e.Value)
|
|
}
|
|
sysLogAddr := net.JoinHostPort(vnet.FakeSyslogIPv4().String(), "995")
|
|
if node.IsV6Only() {
|
|
fmt.Fprintf(&envBuf, " tta.nameserver=%s", vnet.FakeDNSIPv6())
|
|
sysLogAddr = net.JoinHostPort(vnet.FakeSyslogIPv6().String(), "995")
|
|
}
|
|
envStr := envBuf.String()
|
|
|
|
qmpSocks[i] = fmt.Sprintf("%s/qmp-node-%d.sock", nt.tempDir, i)
|
|
qemuArgs := []string{
|
|
"-M", "microvm,isa-serial=off",
|
|
"-m", "384M",
|
|
"-nodefaults", "-no-user-config", "-nographic",
|
|
"-kernel", nt.kernel,
|
|
"-append", "console=hvc0 root=PARTUUID=60c24cc1-f3f9-427a-8199-76baa2d60001/PARTNROFF=1 ro init=/gokrazy/init panic=10 oops=panic pci=off nousb gokrazy.remote_syslog.target=" + sysLogAddr + " tailscale-tta=1" + envStr,
|
|
"-drive", "id=blk0,file=" + disk + ",format=qcow2",
|
|
"-device", "virtio-blk-device,drive=blk0",
|
|
"-netdev", "stream,id=net0,addr.type=unix,addr.path=" + sockAddr,
|
|
"-device", "virtio-serial-device",
|
|
"-device", "virtio-rng-device",
|
|
"-device", "virtio-net-device,netdev=net0,mac=" + node.MAC().String(),
|
|
"-chardev", "stdio,id=virtiocon0,mux=on",
|
|
"-device", "virtconsole,chardev=virtiocon0",
|
|
"-mon", "chardev=virtiocon0,mode=readline",
|
|
"-qmp", "unix:" + qmpSocks[i] + ",server=on,wait=off",
|
|
}
|
|
if haveKVM {
|
|
qemuArgs = append(qemuArgs, "-enable-kvm", "-cpu", "host")
|
|
}
|
|
cmd := exec.Command("qemu-system-x86_64", qemuArgs...)
|
|
cmd.Stdout = os.Stdout
|
|
cmd.Stderr = os.Stderr
|
|
if err := cmd.Start(); err != nil {
|
|
t.Fatalf("qemu: %v", err)
|
|
}
|
|
nt.tb.Cleanup(func() {
|
|
cmd.Process.Kill()
|
|
cmd.Wait()
|
|
})
|
|
}
|
|
|
|
for i, node := range nodes {
|
|
if err := nt.vnet.AwaitFirstPacket(ctx, node.MAC()); err != nil {
|
|
t.Logf("node %v: no boot progress (no packets received): %v", node, err)
|
|
t.Logf("node %v: QMP status: %s", node, qmpQueryStatus(qmpSocks[i]))
|
|
t.FailNow()
|
|
}
|
|
t.Logf("node %v: boot detected (first packet received)", node)
|
|
}
|
|
|
|
for _, n := range nodes {
|
|
client := nt.vnet.NodeAgentClient(n)
|
|
n.SetClient(client)
|
|
clients = append(clients, client)
|
|
}
|
|
|
|
var eg errgroup.Group
|
|
for i, c := range clients {
|
|
eg.Go(func() error {
|
|
node := nodes[i]
|
|
t.Logf("%v calling Status...", node)
|
|
st, err := c.Status(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("%v status: %w", node, err)
|
|
}
|
|
t.Logf("%v status: %v", node, st.BackendState)
|
|
|
|
if node.HostFirewall() {
|
|
if err := c.EnableHostFirewall(ctx); err != nil {
|
|
return fmt.Errorf("%v firewall: %w", node, err)
|
|
}
|
|
t.Logf("%v firewalled", node)
|
|
}
|
|
|
|
if node.ShouldJoinTailnet() {
|
|
if err := up(ctx, c); err != nil {
|
|
return fmt.Errorf("%v up: %w", node, err)
|
|
}
|
|
t.Logf("%v up!", node)
|
|
|
|
st, err = c.Status(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("%v status: %w", node, err)
|
|
}
|
|
|
|
if st.BackendState != "Running" {
|
|
return fmt.Errorf("%v state = %q", node, st.BackendState)
|
|
}
|
|
|
|
t.Logf("%v AllowedIPs: %v", node, st.Self.Addrs)
|
|
t.Logf("%v up with %v", node, st.Self.TailscaleIPs)
|
|
} else {
|
|
t.Logf("%v skipping joining tailnet", node)
|
|
}
|
|
return nil
|
|
})
|
|
}
|
|
if err := eg.Wait(); err != nil {
|
|
t.Fatalf("initial setup: %v", err)
|
|
}
|
|
|
|
return nodes, clients, nt.vnet.Close
|
|
}
|
|
|
|
// hasDeadline is the optional method set testContext probes for on a
// testing.TB: a way to ask when the test will time out.
type hasDeadline interface {
	Deadline() (deadline time.Time, ok bool)
}

// testContext returns a context for running a test body.
//
// When tb reports a deadline, the context expires five seconds before it so
// that cleanup still has time to run. When tb has no Deadline method, or
// reports that no deadline is set, the context expires after 60 seconds.
func testContext(tb testing.TB) (context.Context, context.CancelFunc) {
	const (
		cleanupMargin   = 5 * time.Second
		fallbackTimeout = 60 * time.Second
	)
	background := context.Background()

	d, ok := tb.(hasDeadline)
	if !ok {
		return context.WithTimeout(background, fallbackTimeout)
	}
	deadline, ok := d.Deadline()
	if !ok {
		return context.WithTimeout(background, fallbackTimeout)
	}
	return context.WithDeadline(background, deadline.Add(-cleanupMargin))
}
|
|
|
|
func (nt *natTest) runHostConnectivityTest(addNode ...addNodeFunc) bool {
|
|
ctx, cancel := testContext(nt.tb)
|
|
defer cancel()
|
|
nodes, clients, cleanup := nt.setupTest(ctx, addNode...)
|
|
defer cleanup()
|
|
|
|
if len(nodes) != 2 {
|
|
nt.tb.Logf("ping can only be done among exactly two nodes")
|
|
return false
|
|
}
|
|
var fromClient, toClient *vnet.NodeAgentClient
|
|
for i, n := range nodes {
|
|
if n.ShouldJoinTailnet() && fromClient == nil {
|
|
fromClient = clients[i]
|
|
} else {
|
|
toClient = clients[i]
|
|
}
|
|
}
|
|
got, err := sendHostNetworkPing(ctx, nt.tb, fromClient, toClient)
|
|
if err != nil {
|
|
nt.tb.Fatalf("ping host: %v", err)
|
|
}
|
|
nt.tb.Logf("ping success: %v", got)
|
|
return got
|
|
}
|
|
|
|
func (nt *natTest) runTailscaleConnectivityTest(addNode ...addNodeFunc) pingRoute {
|
|
ctx, cancel := testContext(nt.tb)
|
|
defer cancel()
|
|
|
|
nodes, clients, cleanup := nt.setupTest(ctx, addNode...)
|
|
defer cleanup()
|
|
t := nt.tb
|
|
|
|
if len(nodes) < 2 {
|
|
return ""
|
|
}
|
|
for _, n := range nodes {
|
|
if !n.ShouldJoinTailnet() {
|
|
t.Logf("%v did not join tailnet", n)
|
|
return ""
|
|
}
|
|
}
|
|
|
|
sts := make([]*ipnstate.Status, len(nodes))
|
|
var eg errgroup.Group
|
|
for i, c := range clients {
|
|
eg.Go(func() error {
|
|
node := nodes[i]
|
|
st, err := c.Status(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("%v: %w", node, err)
|
|
}
|
|
sts[i] = st
|
|
return nil
|
|
})
|
|
}
|
|
if err := eg.Wait(); err != nil {
|
|
t.Fatalf("get node statuses: %v", err)
|
|
}
|
|
|
|
preICMPPing := false
|
|
for _, node := range nodes {
|
|
node.Network().PostConnectedToControl()
|
|
if err := node.PostConnectedToControl(ctx); err != nil {
|
|
t.Fatalf("post control error: %s", err)
|
|
}
|
|
if node.PreICMPPing() {
|
|
preICMPPing = true
|
|
}
|
|
}
|
|
|
|
// Should we send traffic across the nodes before starting disco?
|
|
// For nodes that rotated disco keys after control going away.
|
|
if preICMPPing {
|
|
_, err := ping(ctx, t, clients[0], sts[1].Self.TailscaleIPs[0], tailcfg.PingICMP)
|
|
if err != nil {
|
|
t.Fatalf("ICMP ping failure: %v", err)
|
|
}
|
|
}
|
|
|
|
pingRes, err := ping(ctx, t, clients[0], sts[1].Self.TailscaleIPs[0], tailcfg.PingDisco)
|
|
if err != nil {
|
|
t.Logf("ping failure: %v", err)
|
|
}
|
|
nt.gotRoute = classifyPing(pingRes)
|
|
t.Logf("ping route: %v", nt.gotRoute)
|
|
|
|
return nt.gotRoute
|
|
}
|
|
|
|
func classifyPing(pr *ipnstate.PingResult) pingRoute {
|
|
if pr == nil {
|
|
return routeNil
|
|
}
|
|
if pr.Endpoint != "" {
|
|
ap, err := netip.ParseAddrPort(pr.Endpoint)
|
|
if err == nil {
|
|
if ap.Addr().IsPrivate() {
|
|
return routeLocal
|
|
}
|
|
return routeDirect
|
|
}
|
|
}
|
|
return routeDERP // presumably
|
|
}
|
|
|
|
// pingRoute names the kind of path a Tailscale ping took between two nodes.
type pingRoute string

// The routes classifyPing can report.
const (
	routeDERP   = pingRoute("derp")   // no usable endpoint in the result
	routeLocal  = pingRoute("local")  // endpoint with a private address
	routeDirect = pingRoute("direct") // endpoint with a non-private address
	routeNil    = pingRoute("nil")    // *ipnstate.PingResult is nil
)
|
|
|
|
func ping(ctx context.Context, t testing.TB, c *vnet.NodeAgentClient, target netip.Addr, pType tailcfg.PingType) (*ipnstate.PingResult, error) {
|
|
var lastRes *ipnstate.PingResult
|
|
for n := range 10 {
|
|
t.Logf("ping attempt %d to %v ...", n+1, target)
|
|
pingCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
|
pr, err := c.PingWithOpts(pingCtx, target, pType, tailscale.PingOpts{})
|
|
cancel()
|
|
if err != nil {
|
|
t.Logf("ping attempt %d error: %v", n+1, err)
|
|
if ctx.Err() != nil {
|
|
break
|
|
}
|
|
continue
|
|
}
|
|
if pr.Err != "" {
|
|
return nil, errors.New(pr.Err)
|
|
}
|
|
t.Logf("ping attempt %d: derp=%d endpoint=%v latency=%v", n+1, pr.DERPRegionID, pr.Endpoint, pr.LatencySeconds)
|
|
if pr.DERPRegionID == 0 {
|
|
return pr, nil
|
|
}
|
|
lastRes = pr
|
|
select {
|
|
case <-ctx.Done():
|
|
return lastRes, nil
|
|
case <-time.After(time.Second):
|
|
}
|
|
}
|
|
if lastRes != nil {
|
|
return lastRes, nil
|
|
}
|
|
return nil, fmt.Errorf("no ping response (ctx: %v)", ctx.Err())
|
|
}
|
|
|
|
// qmpQueryStatus connects to the QEMU QMP socket at sockPath and returns a
// one-line description of the VM's run state, such as
// "status=running running=true". On failure it returns a short string
// saying which step failed and why.
//
// The dial is limited to two seconds and the whole exchange to five. The
// result is intended for log output only.
func qmpQueryStatus(sockPath string) string {
	conn, err := net.DialTimeout("unix", sockPath, 2*time.Second)
	if err != nil {
		return fmt.Sprintf("dial error: %v", err)
	}
	defer conn.Close()
	if err := conn.SetDeadline(time.Now().Add(5 * time.Second)); err != nil {
		return fmt.Sprintf("set deadline: %v", err)
	}
	dec := json.NewDecoder(conn)

	// qmpReply is one message from the server after the greeting: either a
	// command response ("return" or "error") or an asynchronous event.
	type qmpReply struct {
		Event  string          `json:"event"`
		Return json.RawMessage `json:"return"`
		Error  *struct {
			Class string `json:"class"`
			Desc  string `json:"desc"`
		} `json:"error"`
	}
	// readReply returns the next command response. QMP may interleave
	// asynchronous events with responses; those are skipped so that an
	// event is never mistaken for the answer to a command. The connection
	// deadline bounds the loop.
	readReply := func() (qmpReply, error) {
		for {
			var r qmpReply
			if err := dec.Decode(&r); err != nil {
				return r, err
			}
			if r.Event == "" {
				return r, nil
			}
		}
	}

	// Read QMP greeting.
	var greeting json.RawMessage
	if err := dec.Decode(&greeting); err != nil {
		return fmt.Sprintf("greeting error: %v", err)
	}

	// Enter command mode.
	if _, err := conn.Write([]byte(`{"execute":"qmp_capabilities"}` + "\n")); err != nil {
		return fmt.Sprintf("write caps: %v", err)
	}
	if _, err := readReply(); err != nil {
		return fmt.Sprintf("caps response: %v", err)
	}

	// Query status.
	if _, err := conn.Write([]byte(`{"execute":"query-status"}` + "\n")); err != nil {
		return fmt.Sprintf("write query-status: %v", err)
	}
	reply, err := readReply()
	if err != nil {
		return fmt.Sprintf("status response: %v", err)
	}
	if reply.Error != nil {
		return fmt.Sprintf("qmp error: %s: %s", reply.Error.Class, reply.Error.Desc)
	}
	var status struct {
		Running bool   `json:"running"`
		Status  string `json:"status"`
	}
	if len(reply.Return) > 0 {
		if err := json.Unmarshal(reply.Return, &status); err != nil {
			return fmt.Sprintf("status response: %v", err)
		}
	}
	return fmt.Sprintf("status=%s running=%v", status.Status, status.Running)
}
|
|
|
|
func up(ctx context.Context, c *vnet.NodeAgentClient) error {
|
|
req, err := http.NewRequestWithContext(ctx, "GET", "http://unused/up", nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
res, err := c.HTTPClient.Do(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer res.Body.Close()
|
|
all, _ := io.ReadAll(res.Body)
|
|
if res.StatusCode != 200 {
|
|
return fmt.Errorf("unexpected status code %v: %s", res.Status, all)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func getClientIP(ctx context.Context, c *vnet.NodeAgentClient) (netip.Addr, error) {
|
|
getIPReq, err := http.NewRequestWithContext(ctx, "GET", "http://unused/ip", nil)
|
|
if err != nil {
|
|
return netip.Addr{}, err
|
|
}
|
|
res, err := c.HTTPClient.Do(getIPReq)
|
|
if err != nil {
|
|
return netip.Addr{}, err
|
|
}
|
|
defer res.Body.Close()
|
|
if res.StatusCode != http.StatusOK {
|
|
return netip.Addr{}, fmt.Errorf("client returned http status %q", res.Status)
|
|
}
|
|
ipBytes, err := io.ReadAll(res.Body)
|
|
if err != nil {
|
|
return netip.Addr{}, err
|
|
}
|
|
addrPort, err := netip.ParseAddrPort(string(ipBytes))
|
|
if err != nil {
|
|
return netip.Addr{}, err
|
|
}
|
|
return addrPort.Addr(), nil
|
|
}
|
|
|
|
// sendHostNetworkPing pings toClient from fromClient, and returns whether
|
|
// toClient responded to the ping.
|
|
func sendHostNetworkPing(ctx context.Context, tb testing.TB, fromClient, toClient *vnet.NodeAgentClient) (bool, error) {
|
|
toIP, err := getClientIP(ctx, toClient)
|
|
if err != nil {
|
|
return false, fmt.Errorf("get ip: %w", err)
|
|
}
|
|
req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("http://unused/ping?host=%s", toIP.String()), nil)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
res, err := fromClient.HTTPClient.Do(req)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
defer res.Body.Close()
|
|
got, err := io.ReadAll(res.Body)
|
|
if err != nil {
|
|
tb.Logf("error while reading http body: %v", err)
|
|
} else {
|
|
tb.Logf("got response from ping: %q", got)
|
|
}
|
|
ec, err := strconv.Atoi(res.Header.Get("Exec-Exit-Code"))
|
|
if err != nil {
|
|
return false, fmt.Errorf("parse exit code: %w", err)
|
|
}
|
|
tb.Logf("got ec: %v", ec)
|
|
return ec == 0, nil
|
|
}
|
|
|
|
type nodeType struct {
|
|
name string
|
|
fn addNodeFunc
|
|
}
|
|
|
|
var types = []nodeType{
|
|
{"easy", easy},
|
|
{"easyAF", easyAF},
|
|
{"hard", hard},
|
|
{"easyPMP", easyPMP},
|
|
{"hardPMP", hardPMP},
|
|
{"one2one", one2one},
|
|
{"sameLAN", sameLAN},
|
|
{"cgnat", cgnatNoTailnet},
|
|
}
|
|
|
|
// want sets the expected ping route for the test.
|
|
func (nt *natTest) want(r pingRoute) {
|
|
if nt.gotRoute != r {
|
|
nt.tb.Errorf("ping route = %v; want %v", nt.gotRoute, r)
|
|
}
|
|
}
|
|
|
|
func TestEasyEasy(t *testing.T) {
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(easy, easy)
|
|
nt.want(routeDirect)
|
|
}
|
|
|
|
func TestTwoEasyNoControlDiscoRotate(t *testing.T) {
|
|
envknob.Setenv("TS_USE_CACHED_NETMAP", "1")
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(easyNoControlDiscoRotate, easyNoControlDiscoRotate)
|
|
nt.want(routeDirect)
|
|
}
|
|
|
|
func cgnatNoTailnet(c *vnet.Config) *vnet.Node {
|
|
n := c.NumNodes() + 1
|
|
return c.AddNode(c.AddNetwork(
|
|
fmt.Sprintf("100.65.%d.1/16", n),
|
|
fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP
|
|
vnet.EasyNAT),
|
|
vnet.DontJoinTailnet)
|
|
}
|
|
|
|
func TestNonTailscaleCGNATEndpoint(t *testing.T) {
|
|
if !*knownBroken {
|
|
t.Skip("skipping known-broken test; set --known-broken to run; see https://github.com/tailscale/corp/issues/36270")
|
|
}
|
|
nt := newNatTest(t)
|
|
if !nt.runHostConnectivityTest(cgnatNoTailnet, sameLAN) {
|
|
t.Fatalf("could not ping")
|
|
}
|
|
}
|
|
|
|
// Issue tailscale/corp#26438: use learned DERP route as send path of last
|
|
// resort
|
|
//
|
|
// See (*magicsock.Conn).fallbackDERPRegionForPeer and its comment for
|
|
// background.
|
|
//
|
|
// This sets up a test with two nodes that must use DERP to communicate but the
|
|
// target of the ping (the second node) additionally is not getting DERP or
|
|
// Endpoint updates from the control plane. (Or rather, it's getting them but is
|
|
// configured to scrub them right when they come off the network before being
|
|
// processed) This then tests whether node2, upon receiving a packet, will be
|
|
// able to reply to node1 since it knows neither node1's endpoints nor its home
|
|
// DERP. The only reply route it can use is that fact that it just received a
|
|
// packet over a particular DERP from that peer.
|
|
func TestFallbackDERPRegionForPeer(t *testing.T) {
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(hard, hardNoDERPOrEndoints)
|
|
nt.want(routeDERP)
|
|
}
|
|
|
|
func TestSingleJustIPv6(t *testing.T) {
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(just6)
|
|
}
|
|
|
|
// knownBroken opts in to tests that are known to fail; without it those
// tests skip themselves.
var knownBroken = flag.Bool("known-broken", false, "run known-broken tests")
|
|
|
|
// TestSingleDualStackButBrokenIPv4 tests a dual-stack node with broken
|
|
// (blackholed) IPv4.
|
|
//
|
|
// See https://github.com/tailscale/tailscale/issues/13346
|
|
func TestSingleDualBrokenIPv4(t *testing.T) {
|
|
if !*knownBroken {
|
|
t.Skip("skipping known-broken test; set --known-broken to run; see https://github.com/tailscale/tailscale/issues/13346")
|
|
}
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(v6AndBlackholedIPv4)
|
|
}
|
|
|
|
func TestJustIPv6(t *testing.T) {
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(just6, just6)
|
|
nt.want(routeDirect)
|
|
}
|
|
|
|
func TestEasy4AndJust6(t *testing.T) {
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(easyAnd6, just6)
|
|
nt.want(routeDirect)
|
|
}
|
|
|
|
func TestSameLAN(t *testing.T) {
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(easy, sameLAN)
|
|
nt.want(routeLocal)
|
|
}
|
|
|
|
// TestBPFDisco tests https://github.com/tailscale/tailscale/issues/3824 ...
|
|
// * server behind a Hard NAT
|
|
// * client behind a NAT with UPnP support
|
|
// * client machine has a stateful host firewall (e.g. ufw)
|
|
func TestBPFDisco(t *testing.T) {
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(easyPMPFWPlusBPF, hard)
|
|
nt.want(routeDirect)
|
|
}
|
|
|
|
func TestHostFWNoBPF(t *testing.T) {
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(easyPMPFWNoBPF, hard)
|
|
nt.want(routeDERP)
|
|
}
|
|
|
|
func TestHostFWPair(t *testing.T) {
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(easyFW, easyFW)
|
|
nt.want(routeDirect)
|
|
}
|
|
|
|
func TestOneHostFW(t *testing.T) {
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(easy, easyFW)
|
|
nt.want(routeDirect)
|
|
}
|
|
|
|
// pair selects the two node types TestPair runs against each other, as
// "type1,type2". The names are those registered in types; the help text
// lists all of them, including cgnat.
var pair = flag.String("pair", "", "comma-separated pair of types to test (easy, easyAF, hard, easyPMP, hardPMP, one2one, sameLAN, cgnat)")
|
|
|
|
func TestPair(t *testing.T) {
|
|
t1, t2, ok := strings.Cut(*pair, ",")
|
|
if !ok {
|
|
t.Skipf("skipping test without --pair=type1,type2 set")
|
|
}
|
|
find := func(name string) addNodeFunc {
|
|
for _, nt := range types {
|
|
if nt.name == name {
|
|
return nt.fn
|
|
}
|
|
}
|
|
t.Fatalf("unknown type %q", name)
|
|
return nil
|
|
}
|
|
|
|
nt := newNatTest(t)
|
|
nt.runTailscaleConnectivityTest(find(t1), find(t2))
|
|
}
|
|
|
|
// runGrid opts in to TestGrid, which runs every pairing of the node types
// in types.
var runGrid = flag.Bool("run-grid", false, "run grid test")
|
|
|
|
func TestGrid(t *testing.T) {
|
|
if !*runGrid {
|
|
t.Skip("skipping grid test; set --run-grid to run")
|
|
}
|
|
t.Parallel()
|
|
|
|
sem := syncs.NewSemaphore(2)
|
|
var (
|
|
mu sync.Mutex
|
|
res = make(map[string]pingRoute)
|
|
)
|
|
for _, a := range types {
|
|
for _, b := range types {
|
|
key := a.name + "-" + b.name
|
|
keyBack := b.name + "-" + a.name
|
|
t.Run(key, func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
sem.Acquire()
|
|
defer sem.Release()
|
|
|
|
filename := key + ".cache"
|
|
contents, _ := os.ReadFile(filename)
|
|
if len(contents) == 0 {
|
|
filename2 := keyBack + ".cache"
|
|
contents, _ = os.ReadFile(filename2)
|
|
}
|
|
route := pingRoute(strings.TrimSpace(string(contents)))
|
|
|
|
if route == "" {
|
|
nt := newNatTest(t)
|
|
route = nt.runTailscaleConnectivityTest(a.fn, b.fn)
|
|
if err := os.WriteFile(filename, []byte(string(route)), 0666); err != nil {
|
|
t.Fatalf("writeFile: %v", err)
|
|
}
|
|
}
|
|
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
res[key] = route
|
|
t.Logf("results: %v", res)
|
|
})
|
|
}
|
|
}
|
|
|
|
t.Cleanup(func() {
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
var hb bytes.Buffer
|
|
pf := func(format string, args ...any) {
|
|
fmt.Fprintf(&hb, format, args...)
|
|
}
|
|
rewrite := func(s string) string {
|
|
return strings.ReplaceAll(s, "PMP", "+pm")
|
|
}
|
|
pf("<html><table border=1 cellpadding=5>")
|
|
pf("<tr><td></td>")
|
|
for _, a := range types {
|
|
pf("<td><b>%s</b></td>", rewrite(a.name))
|
|
}
|
|
pf("</tr>\n")
|
|
|
|
for _, a := range types {
|
|
if a.name == "sameLAN" {
|
|
continue
|
|
}
|
|
pf("<tr><td><b>%s</b></td>", rewrite(a.name))
|
|
for _, b := range types {
|
|
key := a.name + "-" + b.name
|
|
key2 := b.name + "-" + a.name
|
|
v := cmp.Or(res[key], res[key2], "-")
|
|
if v == "derp" {
|
|
pf("<td><div style='color: red; font-weight: bold'>%s</div></td>", v)
|
|
} else if v == "local" {
|
|
pf("<td><div style='color: green; font-weight: bold'>%s</div></td>", v)
|
|
} else {
|
|
pf("<td>%s</td>", v)
|
|
}
|
|
}
|
|
pf("</tr>\n")
|
|
}
|
|
pf("</table>")
|
|
pf("<b>easy</b>: Endpoint-Independent Mapping, Address and Port-Dependent Filtering (e.g. Linux, Google Wifi, Unifi, eero)<br>")
|
|
pf("<b>easyAF</b>: Endpoint-Independent Mapping, Address-Dependent Filtering (James says telephony things or Zyxel type things)<br>")
|
|
pf("<b>hard</b>: Address and Port-Dependent Mapping, Address and Port-Dependent Filtering (FreeBSD, OPNSense, pfSense)<br>")
|
|
pf("<b>one2one</b>: One-to-One NAT (e.g. an EC2 instance with a public IPv4)<br>")
|
|
pf("<b>x+pm</b>: x, with port mapping (NAT-PMP, PCP, UPnP, etc)<br>")
|
|
pf("<b>sameLAN</b>: a second node in the same LAN as the first<br>")
|
|
pf("</html>")
|
|
|
|
if err := os.WriteFile("grid.html", hb.Bytes(), 0666); err != nil {
|
|
t.Fatalf("writeFile: %v", err)
|
|
}
|
|
})
|
|
}
|