Files
tailscale/tstest/integration/nat/nat_test.go
T
Brad Fitzpatrick 7dcb378875 tstest/integration/nat, tstest/natlab/vnet: fix natlab test flake
The natlab-integrationtest CI job frequently flakes by exhausting its
3m go test timeout. The root cause is that the QEMU VMs run under
pure software emulation (TCG) with no KVM. Under TCG, the guest
kernel's timer calibration busy-loops are at the mercy of host CPU
scheduling. When two VMs boot simultaneously on a 2-core CI runner,
one VM's calibration gets starved and produces wrong results, leaving
the kernel with broken timers that prevent it from ever completing
boot — even after the other VM finishes and frees up CPU.

Additionally, the microvm machine type doesn't provide HPET hardware,
but the kernel command line specified clocksource=hpet. And the VM
image build (make natlab) ran inside the test itself, consuming most
of the 3m timeout budget before the actual test started.

Fix by:

 - Enabling KVM when /dev/kvm is available, so timer calibration
   uses real hardware timers unaffected by host CPU scheduling.

 - Adding a CI step to set /dev/kvm permissions on the GitHub
   Actions runner (ubuntu-latest provides KVM but needs a udev rule).

 - Pre-building the VM image in a separate CI step so it doesn't
   cut into the go test -timeout budget.

 - Replacing the hardcoded 60s context timeout with one derived from
   t.Deadline(), so the test uses the full -timeout budget.

 - Adding VM boot progress detection (AwaitFirstPacket) and QMP
   diagnostics, so boot failures produce clear errors instead of
   opaque "context deadline exceeded" messages.

With KVM enabled, the test passes reliably even on a single CPU core
with 3 parallel workers — a scenario that was 100% broken under TCG.

Fixes #18906

Change-Id: I4c87631a9c9678d185b9f30cb05c0f7bfa9f5c62
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2026-04-13 16:34:15 -07:00

1003 lines
26 KiB
Go

// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package nat
import (
"bytes"
"cmp"
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
"net"
"net/http"
"net/netip"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"testing"
"time"
"golang.org/x/mod/modfile"
"golang.org/x/sync/errgroup"
"tailscale.com/client/tailscale"
"tailscale.com/envknob"
"tailscale.com/ipn/ipnstate"
"tailscale.com/syncs"
"tailscale.com/tailcfg"
"tailscale.com/tstest/natlab/vnet"
)
// Command-line flags that control which tests run and how much they log.
var (
// runVMTests must be set for newNatTest to proceed; otherwise the test is skipped.
runVMTests = flag.Bool("run-vm-tests", false, "run tests that require a VM")
// logTailscaled makes setupTest enable verbose syslog on every node.
logTailscaled = flag.Bool("log-tailscaled", false, "log tailscaled output")
// pcapFile is handed to vnet.Config.SetPCAPFile in setupTest.
pcapFile = flag.String("pcap", "", "write pcap to file")
)
// natTest carries the state shared by the helpers that boot QEMU nodes on a
// virtual network and check connectivity between them.
type natTest struct {
tb testing.TB // test handle used for logging, skipping, failing, and cleanup
base string // base image
tempDir string // for qcow2 images; also holds the QEMU and QMP unix sockets
vnet *vnet.Server // virtual network server, created in setupTest
kernel string // linux kernel path
gotRoute pingRoute // route recorded by runTailscaleConnectivityTest and checked by want
}
// newNatTest prepares a natTest for tb, or skips the test when its
// prerequisites are not met.
//
// The test is skipped unless --run-vm-tests is set. If the base VM image
// (gokrazy/natlabapp.qcow2 under the module root) is missing, it is built
// with "make natlab"; the test is skipped if the image is still missing
// afterwards or if the kernel path cannot be determined from go.mod. Any
// other filesystem or build error fails the test.
func newNatTest(tb testing.TB) *natTest {
	// Check the opt-in flag before doing any filesystem work, so that an
	// unrequested test is always reported as skipped rather than failed.
	if !*runVMTests {
		tb.Skip("skipping heavy test; set --run-vm-tests to run")
	}

	root, err := os.Getwd()
	if err != nil {
		tb.Fatal(err)
	}
	// The test runs from tstest/integration/nat, three levels below the module root.
	modRoot := filepath.Join(root, "../../..")

	nt := &natTest{
		tb:      tb,
		tempDir: tb.TempDir(),
		base:    filepath.Join(modRoot, "gokrazy/natlabapp.qcow2"),
	}

	if _, err := os.Stat(nt.base); err != nil {
		if !os.IsNotExist(err) {
			tb.Fatal(err)
		}
		tb.Logf("building VM image...")
		cmd := exec.Command("make", "natlab")
		cmd.Dir = filepath.Join(modRoot, "gokrazy")
		cmd.Stderr = os.Stderr
		cmd.Stdout = os.Stdout
		if err := cmd.Run(); err != nil {
			tb.Fatalf("Error running 'make natlab' in gokrazy directory: %v", err)
		}
		if _, err := os.Stat(nt.base); err != nil {
			tb.Skipf("still can't find VM image: %v", err)
		}
	}

	nt.kernel, err = findKernelPath(filepath.Join(modRoot, "go.mod"))
	if err != nil {
		tb.Skipf("skipping test; kernel not found: %v", err)
	}
	tb.Logf("found kernel: %v", nt.kernel)
	return nt
}
// findKernelPath returns the path of the vmlinuz file inside the
// github.com/tailscale/gokrazy-kernel module required by the go.mod file at
// goMod, located under the Go module cache.
//
// It returns an error if goMod cannot be read or parsed, if it does not
// require the kernel module, or if the module cache directory cannot be
// determined. It does not check that the returned file exists.
func findKernelPath(goMod string) (string, error) {
	b, err := os.ReadFile(goMod)
	if err != nil {
		return "", err
	}
	mf, err := modfile.Parse("go.mod", b, nil)
	if err != nil {
		return "", err
	}
	for _, r := range mf.Require {
		if r.Mod.Path != "github.com/tailscale/gokrazy-kernel" {
			continue
		}
		// Use Output rather than CombinedOutput: anything the go tool
		// prints on stderr must not end up inside the returned path.
		out, err := exec.Command("go", "env", "GOMODCACHE").Output()
		if err != nil {
			return "", fmt.Errorf("go env GOMODCACHE: %w", err)
		}
		modCache := strings.TrimSpace(string(out))
		if modCache == "" {
			return "", fmt.Errorf("go env GOMODCACHE returned an empty path")
		}
		// r.Mod.String() is "path@version", the module's directory
		// name relative to the module cache.
		return filepath.Join(modCache, r.Mod.String(), "vmlinuz"), nil
	}
	return "", fmt.Errorf("failed to find kernel in %v", goMod)
}
// addNodeFunc adds one node to c and returns it. setupTest skips the test
// when the returned node is nil.
type addNodeFunc func(c *vnet.Config) *vnet.Node // returns nil to omit test
// v6cidr returns the IPv6 prefix string "2000:<n>::1/64" used for the n'th
// test network.
func v6cidr(n int) string {
	// Sprint inserts no space between operands when one of each adjacent
	// pair is a string, so the pieces are joined verbatim.
	return fmt.Sprint("2000:", n, "::1/64")
}
// easy adds a node on a new network configured with vnet.EasyNAT. The
// network's public IP and LAN prefix are derived from the node's 1-based index.
func easy(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)
	nw := c.AddNetwork(wanIP, lanCIDR, vnet.EasyNAT)
	return c.AddNode(nw)
}
// easyAnd6 adds a node on a new vnet.EasyNAT network that additionally has
// the IPv6 prefix returned by v6cidr for the node's 1-based index.
func easyAnd6(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)
	v6 := v6cidr(idx)
	nw := c.AddNetwork(wanIP, lanCIDR, v6, vnet.EasyNAT)
	return c.AddNode(nw)
}
// easyNoControlDiscoRotate adds a node behind an easy NAT whose network is
// set to blackhole control traffic after connecting. The node runs with
// TS_USE_CACHED_NETMAP=true and carries the vnet.RotateDisco and
// vnet.PreICMPPing options, simulating a node that restarted (from a disco
// point of view) while control is unreachable.
func easyNoControlDiscoRotate(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)

	nw := c.AddNetwork(wanIP, lanCIDR, vnet.EasyNAT)
	nw.SetPostConnectControlBlackhole(true)

	cachedNetmap := vnet.TailscaledEnv{
		Key:   "TS_USE_CACHED_NETMAP",
		Value: "true",
	}
	return c.AddNode(cachedNetmap, vnet.RotateDisco, vnet.PreICMPPing, nw)
}
// v6AndBlackholedIPv4 adds a node on a new dual-stack vnet.EasyNAT network
// whose IPv4 traffic is marked as blackholed.
func v6AndBlackholedIPv4(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)
	v6 := v6cidr(idx)

	dualStack := c.AddNetwork(wanIP, lanCIDR, v6, vnet.EasyNAT)
	dualStack.SetBlackholedIPv4(true)
	return c.AddNode(dualStack)
}
// just6 adds a node on a new network that is configured with only a public
// IPv6 prefix.
func just6(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	prefix := v6cidr(idx) // public IPv6 prefix
	return c.AddNode(c.AddNetwork(prefix))
}
// easyFW is like easy, but the node additionally has the vnet.HostFirewall
// option set.
func easyFW(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)
	nw := c.AddNetwork(wanIP, lanCIDR, vnet.EasyNAT)
	return c.AddNode(vnet.HostFirewall, nw)
}
// easyAF adds a node on a new network configured with vnet.EasyAFNAT.
func easyAF(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)
	nw := c.AddNetwork(wanIP, lanCIDR, vnet.EasyAFNAT)
	return c.AddNode(nw)
}
// sameLAN adds a node to the first network already present in c. It returns
// nil when c has no network yet or when that network cannot take more nodes.
func sameLAN(c *vnet.Config) *vnet.Node {
	first := c.FirstNetwork()
	if first == nil || !first.CanTakeMoreNodes() {
		return nil
	}
	return c.AddNode(first)
}
// one2one adds a node on a new network configured with vnet.One2OneNAT,
// using a 172.16.<n>.1/24 LAN prefix.
func one2one(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("172.16.%d.1/24", idx)
	nw := c.AddNetwork(wanIP, lanCIDR, vnet.One2OneNAT)
	return c.AddNode(nw)
}
// easyPMP is like easy, but the network also has the vnet.NATPMP option.
func easyPMP(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)
	nw := c.AddNetwork(wanIP, lanCIDR, vnet.EasyNAT, vnet.NATPMP)
	return c.AddNode(nw)
}
// easyPMPFWPlusBPF adds a host-firewalled node behind an easy NAT with
// vnet.NATPMP. tailscaled runs with raw disco enabled
// (TS_ENABLE_RAW_DISCO=true) plus extra disco debugging and log verbosity.
func easyPMPFWPlusBPF(c *vnet.Config) *vnet.Node {
	env := func(key, value string) vnet.TailscaledEnv {
		return vnet.TailscaledEnv{Key: key, Value: value}
	}

	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)

	return c.AddNode(
		vnet.HostFirewall,
		env("TS_ENABLE_RAW_DISCO", "true"),
		env("TS_DEBUG_RAW_DISCO", "1"),
		env("TS_DEBUG_DISCO", "1"),
		env("TS_LOG_VERBOSITY", "2"),
		c.AddNetwork(wanIP, lanCIDR, vnet.EasyNAT, vnet.NATPMP),
	)
}
// easyPMPFWNoBPF adds a host-firewalled node behind an easy NAT with
// vnet.NATPMP, with raw disco explicitly disabled (TS_ENABLE_RAW_DISCO=false).
func easyPMPFWNoBPF(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)

	rawDiscoOff := vnet.TailscaledEnv{
		Key:   "TS_ENABLE_RAW_DISCO",
		Value: "false",
	}
	return c.AddNode(
		vnet.HostFirewall,
		rawDiscoOff,
		c.AddNetwork(wanIP, lanCIDR, vnet.EasyNAT, vnet.NATPMP),
	)
}
// hard adds a node on a new network configured with vnet.HardNAT, using a
// 10.0.<n>.1/24 LAN prefix.
func hard(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("10.0.%d.1/24", idx)
	nw := c.AddNetwork(wanIP, lanCIDR, vnet.HardNAT)
	return c.AddNode(nw)
}
// hardNoDERPOrEndoints is like hard, but tailscaled runs with
// TS_DEBUG_STRIP_ENDPOINTS=1 and TS_DEBUG_STRIP_HOME_DERP=1.
func hardNoDERPOrEndoints(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("10.0.%d.1/24", idx)
	nw := c.AddNetwork(wanIP, lanCIDR, vnet.HardNAT)

	stripEndpoints := vnet.TailscaledEnv{Key: "TS_DEBUG_STRIP_ENDPOINTS", Value: "1"}
	stripHomeDERP := vnet.TailscaledEnv{Key: "TS_DEBUG_STRIP_HOME_DERP", Value: "1"}
	return c.AddNode(nw, stripEndpoints, stripHomeDERP)
}
// hardPMP adds a node on a new vnet.HardNAT network that also has the
// vnet.NATPMP option, using a 10.7.<n>.1/24 LAN prefix.
func hardPMP(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("10.7.%d.1/24", idx)
	nw := c.AddNetwork(wanIP, lanCIDR, vnet.HardNAT, vnet.NATPMP)
	return c.AddNode(nw)
}
// setupTest builds a virtual network from the one or two addNode functions,
// boots one QEMU VM per node attached to that network, waits for each VM to
// send its first packet, and then (in parallel, per node) queries status,
// optionally enables the host firewall, and optionally joins the tailnet.
//
// It returns the nodes, one agent client per node (same order), and a
// cleanup function that closes the virtual network. Any failure ends the test
// via nt.tb; the test is skipped if an addNode function returns nil.
func (nt *natTest) setupTest(ctx context.Context, addNode ...addNodeFunc) (nodes []*vnet.Node, clients []*vnet.NodeAgentClient, cleanup func()) {
if len(addNode) < 1 || len(addNode) > 2 {
// NOTE(review): the message says "runTest" although this function is setupTest.
nt.tb.Fatalf("runTest: invalid number of nodes %v; want 1 or 2", len(addNode))
}
t := nt.tb
var c vnet.Config
c.SetPCAPFile(*pcapFile)
for _, fn := range addNode {
node := fn(&c)
if node == nil {
t.Skip("skipping test; not applicable combination")
}
nodes = append(nodes, node)
if *logTailscaled {
node.SetVerboseSyslog(true)
}
}
var err error
nt.vnet, err = vnet.New(&c)
if err != nil {
t.Fatalf("newServer: %v", err)
}
nt.tb.Cleanup(func() {
nt.vnet.Close()
})
// Deferred calls run last-in-first-out: srv.Close (deferred below) runs
// first, which makes Accept return an error and ends the accept goroutine,
// and only then does wg.Wait run.
var wg sync.WaitGroup // waiting for srv.Accept goroutine
defer wg.Wait()
// Each QEMU process connects its network device to this unix socket.
sockAddr := filepath.Join(nt.tempDir, "qemu.sock")
srv, err := net.Listen("unix", sockAddr)
if err != nil {
t.Fatalf("Listen: %v", err)
}
defer srv.Close()
wg.Go(func() {
for {
c, err := srv.Accept()
if err != nil {
return
}
go nt.vnet.ServeUnixConn(c.(*net.UnixConn), vnet.ProtocolQEMU)
}
})
// Use KVM only on Linux and only if /dev/kvm can actually be opened
// read-write by this process.
haveKVM := false
if runtime.GOOS == "linux" {
if f, err := os.OpenFile("/dev/kvm", os.O_RDWR, 0); err == nil {
f.Close()
haveKVM = true
}
}
// One QMP control socket path per node, used for diagnostics if boot stalls.
qmpSocks := make([]string, len(nodes))
for i, node := range nodes {
// Create a per-node qcow2 disk backed by the shared base image.
disk := fmt.Sprintf("%s/node-%d.qcow2", nt.tempDir, i)
out, err := exec.Command("qemu-img", "create",
"-f", "qcow2",
"-F", "qcow2",
"-b", nt.base,
disk).CombinedOutput()
if err != nil {
t.Fatalf("qemu-img create: %v, %s", err, out)
}
// Per-node settings are appended to the kernel command line below.
var envBuf bytes.Buffer
for _, e := range node.Env() {
fmt.Fprintf(&envBuf, " tailscaled.env=%s=%s", e.Key, e.Value)
}
sysLogAddr := net.JoinHostPort(vnet.FakeSyslogIPv4().String(), "995")
if node.IsV6Only() {
// IPv6-only nodes get the IPv6 fake DNS and syslog addresses instead.
fmt.Fprintf(&envBuf, " tta.nameserver=%s", vnet.FakeDNSIPv6())
sysLogAddr = net.JoinHostPort(vnet.FakeSyslogIPv6().String(), "995")
}
envStr := envBuf.String()
qmpSocks[i] = fmt.Sprintf("%s/qmp-node-%d.sock", nt.tempDir, i)
qemuArgs := []string{
"-M", "microvm,isa-serial=off",
"-m", "384M",
"-nodefaults", "-no-user-config", "-nographic",
"-kernel", nt.kernel,
"-append", "console=hvc0 root=PARTUUID=60c24cc1-f3f9-427a-8199-76baa2d60001/PARTNROFF=1 ro init=/gokrazy/init panic=10 oops=panic pci=off nousb gokrazy.remote_syslog.target=" + sysLogAddr + " tailscale-tta=1" + envStr,
"-drive", "id=blk0,file=" + disk + ",format=qcow2",
"-device", "virtio-blk-device,drive=blk0",
"-netdev", "stream,id=net0,addr.type=unix,addr.path=" + sockAddr,
"-device", "virtio-serial-device",
"-device", "virtio-rng-device",
"-device", "virtio-net-device,netdev=net0,mac=" + node.MAC().String(),
"-chardev", "stdio,id=virtiocon0,mux=on",
"-device", "virtconsole,chardev=virtiocon0",
"-mon", "chardev=virtiocon0,mode=readline",
"-qmp", "unix:" + qmpSocks[i] + ",server=on,wait=off",
}
if haveKVM {
qemuArgs = append(qemuArgs, "-enable-kvm", "-cpu", "host")
}
cmd := exec.Command("qemu-system-x86_64", qemuArgs...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Start(); err != nil {
t.Fatalf("qemu: %v", err)
}
// Make sure the VM process is killed and reaped when the test ends.
nt.tb.Cleanup(func() {
cmd.Process.Kill()
cmd.Wait()
})
}
// Wait for every VM to show boot progress before talking to its agent. On
// failure, log the QEMU-reported VM status to help diagnose the stall.
for i, node := range nodes {
if err := nt.vnet.AwaitFirstPacket(ctx, node.MAC()); err != nil {
t.Logf("node %v: no boot progress (no packets received): %v", node, err)
t.Logf("node %v: QMP status: %s", node, qmpQueryStatus(qmpSocks[i]))
t.FailNow()
}
t.Logf("node %v: boot detected (first packet received)", node)
}
for _, n := range nodes {
client := nt.vnet.NodeAgentClient(n)
n.SetClient(client)
clients = append(clients, client)
}
// Bring all nodes up concurrently; the first error fails the test.
var eg errgroup.Group
for i, c := range clients {
eg.Go(func() error {
node := nodes[i]
t.Logf("%v calling Status...", node)
st, err := c.Status(ctx)
if err != nil {
return fmt.Errorf("%v status: %w", node, err)
}
t.Logf("%v status: %v", node, st.BackendState)
if node.HostFirewall() {
if err := c.EnableHostFirewall(ctx); err != nil {
return fmt.Errorf("%v firewall: %w", node, err)
}
t.Logf("%v firewalled", node)
}
if node.ShouldJoinTailnet() {
if err := up(ctx, c); err != nil {
return fmt.Errorf("%v up: %w", node, err)
}
t.Logf("%v up!", node)
// Re-read status after "up" and require the Running state.
st, err = c.Status(ctx)
if err != nil {
return fmt.Errorf("%v status: %w", node, err)
}
if st.BackendState != "Running" {
return fmt.Errorf("%v state = %q", node, st.BackendState)
}
t.Logf("%v AllowedIPs: %v", node, st.Self.Addrs)
t.Logf("%v up with %v", node, st.Self.TailscaleIPs)
} else {
t.Logf("%v skipping joining tailnet", node)
}
return nil
})
}
if err := eg.Wait(); err != nil {
t.Fatalf("initial setup: %v", err)
}
// NOTE(review): nt.vnet.Close is also registered via tb.Cleanup above, so
// it may be called twice; confirm that Close tolerates that.
return nodes, clients, nt.vnet.Close
}
// hasDeadline is implemented by test handles that can report the deadline
// imposed by -timeout (testing.TB itself does not declare Deadline).
type hasDeadline interface {
	Deadline() (deadline time.Time, ok bool)
}

// testContext returns a context that expires shortly before the test's own
// deadline, so that there is time left for cleanup. When tb cannot report a
// deadline, or none is set, the context times out after 60 seconds instead.
func testContext(tb testing.TB) (context.Context, context.CancelFunc) {
	const (
		cleanupMargin   = 5 * time.Second
		fallbackTimeout = 60 * time.Second
	)
	parent := context.Background()

	dt, ok := tb.(hasDeadline)
	if !ok {
		return context.WithTimeout(parent, fallbackTimeout)
	}
	limit, ok := dt.Deadline()
	if !ok {
		return context.WithTimeout(parent, fallbackTimeout)
	}
	return context.WithDeadline(parent, limit.Add(-cleanupMargin))
}
// runHostConnectivityTest sets up the given nodes and asks the agent of the
// first node that joined the tailnet to ping the other node's IP address. It
// reports whether that ping succeeded.
//
// It returns false without pinging when the setup did not produce exactly
// two nodes, and fails the test if no node joined the tailnet or the ping
// request itself could not be carried out.
func (nt *natTest) runHostConnectivityTest(addNode ...addNodeFunc) bool {
	ctx, cancel := testContext(nt.tb)
	defer cancel()
	nodes, clients, cleanup := nt.setupTest(ctx, addNode...)
	defer cleanup()
	if len(nodes) != 2 {
		nt.tb.Logf("ping can only be done among exactly two nodes")
		return false
	}

	// The first tailnet-joined node is the ping source; the other node is
	// the target.
	var fromClient, toClient *vnet.NodeAgentClient
	for i, n := range nodes {
		if n.ShouldJoinTailnet() && fromClient == nil {
			fromClient = clients[i]
		} else {
			toClient = clients[i]
		}
	}
	// Without this guard a configuration where neither node joins the
	// tailnet would dereference a nil client inside sendHostNetworkPing.
	if fromClient == nil {
		nt.tb.Fatalf("ping host: no node joined the tailnet to ping from")
	}

	got, err := sendHostNetworkPing(ctx, nt.tb, fromClient, toClient)
	if err != nil {
		nt.tb.Fatalf("ping host: %v", err)
	}
	nt.tb.Logf("ping success: %v", got)
	return got
}
// runTailscaleConnectivityTest sets up the given nodes, pings the second
// node's first Tailscale IP from the first node using disco pings, and
// records and returns how the ping was routed (see classifyPing).
//
// It returns "" without pinging (and without updating nt.gotRoute) when fewer
// than two nodes were set up or when any node did not join the tailnet. A
// failed disco ping is only logged; the route is then classified from a nil
// result.
func (nt *natTest) runTailscaleConnectivityTest(addNode ...addNodeFunc) pingRoute {
ctx, cancel := testContext(nt.tb)
defer cancel()
nodes, clients, cleanup := nt.setupTest(ctx, addNode...)
defer cleanup()
t := nt.tb
if len(nodes) < 2 {
return ""
}
for _, n := range nodes {
if !n.ShouldJoinTailnet() {
t.Logf("%v did not join tailnet", n)
return ""
}
}
// Fetch every node's status concurrently; sts[i] belongs to nodes[i].
sts := make([]*ipnstate.Status, len(nodes))
var eg errgroup.Group
for i, c := range clients {
eg.Go(func() error {
node := nodes[i]
st, err := c.Status(ctx)
if err != nil {
return fmt.Errorf("%v: %w", node, err)
}
sts[i] = st
return nil
})
}
if err := eg.Wait(); err != nil {
t.Fatalf("get node statuses: %v", err)
}
// Run the per-network and per-node post-connect hooks, in node order, and
// note whether any node asks for an ICMP ping before the disco ping.
preICMPPing := false
for _, node := range nodes {
node.Network().PostConnectedToControl()
if err := node.PostConnectedToControl(ctx); err != nil {
t.Fatalf("post control error: %s", err)
}
if node.PreICMPPing() {
preICMPPing = true
}
}
// Should we send traffic across the nodes before starting disco?
// For nodes that rotated disco keys after control going away.
if preICMPPing {
_, err := ping(ctx, t, clients[0], sts[1].Self.TailscaleIPs[0], tailcfg.PingICMP)
if err != nil {
t.Fatalf("ICMP ping failure: %v", err)
}
}
pingRes, err := ping(ctx, t, clients[0], sts[1].Self.TailscaleIPs[0], tailcfg.PingDisco)
if err != nil {
// Not fatal: pingRes is then classified as-is below.
t.Logf("ping failure: %v", err)
}
nt.gotRoute = classifyPing(pingRes)
t.Logf("ping route: %v", nt.gotRoute)
return nt.gotRoute
}
// classifyPing maps a ping result to the kind of route it used:
//
//   - routeNil if pr is nil,
//   - routeLocal if pr.Endpoint parses as an address:port with a private address,
//   - routeDirect if it parses with any other address,
//   - routeDERP otherwise (empty or unparseable endpoint).
func classifyPing(pr *ipnstate.PingResult) pingRoute {
	switch {
	case pr == nil:
		return routeNil
	case pr.Endpoint == "":
		return routeDERP // presumably
	}

	ap, err := netip.ParseAddrPort(pr.Endpoint)
	if err != nil {
		return routeDERP // presumably
	}
	if ap.Addr().IsPrivate() {
		return routeLocal
	}
	return routeDirect
}
// pingRoute names the kind of path a ping between two nodes took, as decided
// by classifyPing.
type pingRoute string
// The possible pingRoute values.
const (
routeDERP pingRoute = "derp" // no parseable direct endpoint in the result
routeLocal pingRoute = "local" // direct endpoint with a private address
routeDirect pingRoute = "direct" // direct endpoint with a non-private address
routeNil pingRoute = "nil" // *ipnstate.PingResult is nil
)
// ping asks c to ping target with the given ping type, trying up to ten
// times with a two second timeout per attempt.
//
// It returns as soon as a result with DERPRegionID == 0 arrives. Results
// that carry a DERP region are remembered and retried after one second; the
// most recent such result is returned if the attempts run out or ctx ends.
// A result with a non-empty Err field is returned as an error immediately.
// If no result was ever obtained, an error is returned.
func ping(ctx context.Context, t testing.TB, c *vnet.NodeAgentClient, target netip.Addr, pType tailcfg.PingType) (*ipnstate.PingResult, error) {
	const (
		maxAttempts    = 10
		attemptTimeout = 2 * time.Second
		retryDelay     = time.Second
	)

	var viaDERP *ipnstate.PingResult // latest result that went over DERP
attempts:
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		t.Logf("ping attempt %d to %v ...", attempt, target)

		res, err := func() (*ipnstate.PingResult, error) {
			pingCtx, cancel := context.WithTimeout(ctx, attemptTimeout)
			defer cancel()
			return c.PingWithOpts(pingCtx, target, pType, tailscale.PingOpts{})
		}()
		switch {
		case err != nil:
			t.Logf("ping attempt %d error: %v", attempt, err)
			if ctx.Err() != nil {
				// The overall context is done; retrying is pointless.
				break attempts
			}
			continue
		case res.Err != "":
			return nil, errors.New(res.Err)
		}

		t.Logf("ping attempt %d: derp=%d endpoint=%v latency=%v", attempt, res.DERPRegionID, res.Endpoint, res.LatencySeconds)
		if res.DERPRegionID == 0 {
			return res, nil
		}

		viaDERP = res
		select {
		case <-ctx.Done():
			return viaDERP, nil
		case <-time.After(retryDelay):
		}
	}

	if viaDERP == nil {
		return nil, fmt.Errorf("no ping response (ctx: %v)", ctx.Err())
	}
	return viaDERP, nil
}
// qmpQueryStatus connects to the QEMU QMP unix socket at sockPath and
// returns a human-readable description of the VM state, such as
// "status=running running=true".
//
// It never fails: any problem (dial, protocol, or QMP-level error) is
// described in the returned string instead. The whole exchange is bounded by
// a 2 second dial timeout plus a 5 second I/O deadline.
func qmpQueryStatus(sockPath string) string {
	conn, err := net.DialTimeout("unix", sockPath, 2*time.Second)
	if err != nil {
		return fmt.Sprintf("dial error: %v", err)
	}
	defer conn.Close()
	if err := conn.SetDeadline(time.Now().Add(5 * time.Second)); err != nil {
		return fmt.Sprintf("set deadline: %v", err)
	}
	dec := json.NewDecoder(conn)

	// Read QMP greeting.
	var greeting json.RawMessage
	if err := dec.Decode(&greeting); err != nil {
		return fmt.Sprintf("greeting error: %v", err)
	}

	// Enter command mode.
	if _, err := conn.Write([]byte(`{"execute":"qmp_capabilities"}` + "\n")); err != nil {
		return fmt.Sprintf("write caps: %v", err)
	}
	var capsResp json.RawMessage
	if err := dec.Decode(&capsResp); err != nil {
		return fmt.Sprintf("caps response: %v", err)
	}

	// Query status.
	if _, err := conn.Write([]byte(`{"execute":"query-status"}` + "\n")); err != nil {
		return fmt.Sprintf("write query-status: %v", err)
	}

	type qmpMessage struct {
		Event  string `json:"event"`
		Return struct {
			Running bool   `json:"running"`
			Status  string `json:"status"`
		} `json:"return"`
		Error *struct {
			Class string `json:"class"`
			Desc  string `json:"desc"`
		} `json:"error"`
	}
	// Once in command mode, QEMU may interleave asynchronous event
	// messages with command replies. Skip those until the actual reply
	// arrives; the connection deadline bounds the loop.
	for {
		var msg qmpMessage // fresh value so fields of a skipped message don't linger
		if err := dec.Decode(&msg); err != nil {
			return fmt.Sprintf("status response: %v", err)
		}
		if msg.Event != "" {
			continue
		}
		if msg.Error != nil {
			return fmt.Sprintf("qmp error: %s: %s", msg.Error.Class, msg.Error.Desc)
		}
		return fmt.Sprintf("status=%s running=%v", msg.Return.Status, msg.Return.Running)
	}
}
// up sends a GET /up request through the node agent client c and returns an
// error unless the agent answers with HTTP 200. On any other status the
// response body is included in the error.
func up(ctx context.Context, c *vnet.NodeAgentClient) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://unused/up", nil)
	if err != nil {
		return err
	}
	res, err := c.HTTPClient.Do(req)
	if err != nil {
		return err
	}
	defer res.Body.Close()

	// The body is only used for the error message, so a read error is
	// deliberately ignored.
	body, _ := io.ReadAll(res.Body)
	if res.StatusCode == http.StatusOK {
		return nil
	}
	return fmt.Errorf("unexpected status code %v: %s", res.Status, body)
}
// getClientIP sends a GET /ip request through the node agent client c and
// returns the address part of the "addr:port" value found in the response
// body.
func getClientIP(ctx context.Context, c *vnet.NodeAgentClient) (netip.Addr, error) {
	var none netip.Addr // zero value returned alongside every error

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://unused/ip", nil)
	if err != nil {
		return none, err
	}
	res, err := c.HTTPClient.Do(req)
	if err != nil {
		return none, err
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return none, fmt.Errorf("client returned http status %q", res.Status)
	}
	body, err := io.ReadAll(res.Body)
	if err != nil {
		return none, err
	}
	ap, err := netip.ParseAddrPort(string(body))
	if err != nil {
		return none, err
	}
	return ap.Addr(), nil
}
// sendHostNetworkPing looks up toClient's IP via its agent and then asks
// fromClient's agent to ping that IP. It reports whether the ping command
// exited with code 0, as conveyed by the Exec-Exit-Code response header.
//
// An error is returned if the IP lookup or the ping request fails, or if the
// exit code header cannot be parsed. A failure to read the response body is
// only logged.
func sendHostNetworkPing(ctx context.Context, tb testing.TB, fromClient, toClient *vnet.NodeAgentClient) (bool, error) {
	toIP, err := getClientIP(ctx, toClient)
	if err != nil {
		return false, fmt.Errorf("get ip: %w", err)
	}

	pingURL := fmt.Sprintf("http://unused/ping?host=%s", toIP.String())
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, pingURL, nil)
	if err != nil {
		return false, err
	}
	res, err := fromClient.HTTPClient.Do(req)
	if err != nil {
		return false, err
	}
	defer res.Body.Close()

	body, readErr := io.ReadAll(res.Body)
	if readErr == nil {
		tb.Logf("got response from ping: %q", body)
	} else {
		tb.Logf("error while reading http body: %v", readErr)
	}

	exitCode, err := strconv.Atoi(res.Header.Get("Exec-Exit-Code"))
	if err != nil {
		return false, fmt.Errorf("parse exit code: %w", err)
	}
	tb.Logf("got ec: %v", exitCode)
	return exitCode == 0, nil
}
// nodeType pairs a node-constructor function with the name used to select it
// on the command line and to label it in the grid report.
type nodeType struct {
name string // name matched by TestPair and used in TestGrid keys
fn addNodeFunc // constructor that adds a node of this type
}
// types lists the node types that TestPair can look up by name and that
// TestGrid combines pairwise.
var types = []nodeType{
{"easy", easy},
{"easyAF", easyAF},
{"hard", hard},
{"easyPMP", easyPMP},
{"hardPMP", hardPMP},
{"one2one", one2one},
{"sameLAN", sameLAN},
{"cgnat", cgnatNoTailnet},
}
// want reports a test error if the route recorded by the most recent
// runTailscaleConnectivityTest call differs from r.
func (nt *natTest) want(r pingRoute) {
	if got := nt.gotRoute; got != r {
		nt.tb.Errorf("ping route = %v; want %v", got, r)
	}
}
// TestEasyEasy expects two nodes, each behind its own easy NAT, to end up
// with a direct route.
func TestEasyEasy(t *testing.T) {
	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(easy, easy)
	lab.want(routeDirect)
}
// TestTwoEasyNoControlDiscoRotate expects a direct route between two
// easyNoControlDiscoRotate nodes. TS_USE_CACHED_NETMAP is set via envknob
// before the test environment is created.
func TestTwoEasyNoControlDiscoRotate(t *testing.T) {
	envknob.Setenv("TS_USE_CACHED_NETMAP", "1")

	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(easyNoControlDiscoRotate, easyNoControlDiscoRotate)
	lab.want(routeDirect)
}
// cgnatNoTailnet adds a node that does not join the tailnet
// (vnet.DontJoinTailnet) on a new easy-NAT network whose LAN prefix is
// 100.65.<n>.1/16.
func cgnatNoTailnet(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	lanCIDR := fmt.Sprintf("100.65.%d.1/16", idx)
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	nw := c.AddNetwork(lanCIDR, wanIP, vnet.EasyNAT)
	return c.AddNode(nw, vnet.DontJoinTailnet)
}
// TestNonTailscaleCGNATEndpoint checks that a tailnet node can ping a
// non-tailnet node on the same LAN that uses a 100.65.0.0/16 address. It is
// known to be broken and only runs with --known-broken.
func TestNonTailscaleCGNATEndpoint(t *testing.T) {
	if !*knownBroken {
		t.Skip("skipping known-broken test; set --known-broken to run; see https://github.com/tailscale/corp/issues/36270")
	}

	lab := newNatTest(t)
	reachable := lab.runHostConnectivityTest(cgnatNoTailnet, sameLAN)
	if !reachable {
		t.Fatalf("could not ping")
	}
}
// TestFallbackDERPRegionForPeer covers issue tailscale/corp#26438: using a
// learned DERP route as the send path of last resort.
//
// See (*magicsock.Conn).fallbackDERPRegionForPeer and its comment for
// background.
//
// Both nodes sit behind hard NATs and therefore must talk over DERP. The ping
// target (the second node) is additionally configured to strip the DERP and
// endpoint updates it receives from the control plane before processing
// them, so it knows neither node1's endpoints nor its home DERP. The test
// checks that node2 can still reply: the only route available to it is the
// fact that it just received a packet from that peer over a particular DERP.
func TestFallbackDERPRegionForPeer(t *testing.T) {
	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(hard, hardNoDERPOrEndoints)
	lab.want(routeDERP)
}
// TestSingleJustIPv6 checks that a single IPv6-only node can be set up. With
// only one node there is no ping and no route expectation.
func TestSingleJustIPv6(t *testing.T) {
	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(just6)
}
// knownBroken opts in to tests that are otherwise skipped because they are
// known to fail.
var knownBroken = flag.Bool("known-broken", false, "run known-broken tests")
// TestSingleDualBrokenIPv4 sets up a single dual-stack node whose IPv4 is
// broken (blackholed). It is known to be broken and only runs with
// --known-broken.
//
// See https://github.com/tailscale/tailscale/issues/13346
func TestSingleDualBrokenIPv4(t *testing.T) {
	if !*knownBroken {
		t.Skip("skipping known-broken test; set --known-broken to run; see https://github.com/tailscale/tailscale/issues/13346")
	}

	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(v6AndBlackholedIPv4)
}
// TestJustIPv6 expects a direct route between two IPv6-only nodes.
func TestJustIPv6(t *testing.T) {
	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(just6, just6)
	lab.want(routeDirect)
}
// TestEasy4AndJust6 expects a direct route between a dual-stack easy-NAT
// node and an IPv6-only node.
func TestEasy4AndJust6(t *testing.T) {
	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(easyAnd6, just6)
	lab.want(routeDirect)
}
// TestSameLAN expects two nodes on the same LAN to use a local route.
func TestSameLAN(t *testing.T) {
	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(easy, sameLAN)
	lab.want(routeLocal)
}
// TestBPFDisco tests https://github.com/tailscale/tailscale/issues/3824 ...
//   - server behind a Hard NAT
//   - client behind an easy NAT with port mapping (vnet.NATPMP here)
//   - client machine has a stateful host firewall (e.g. ufw)
//
// With raw disco enabled on the client, a direct route is expected.
func TestBPFDisco(t *testing.T) {
	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(easyPMPFWPlusBPF, hard)
	lab.want(routeDirect)
}
// TestHostFWNoBPF is the counterpart of TestBPFDisco with raw disco disabled
// on the firewalled client; the nodes are then expected to fall back to DERP.
func TestHostFWNoBPF(t *testing.T) {
	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(easyPMPFWNoBPF, hard)
	lab.want(routeDERP)
}
// TestHostFWPair expects a direct route between two easy-NAT nodes that both
// run a host firewall.
func TestHostFWPair(t *testing.T) {
	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(easyFW, easyFW)
	lab.want(routeDirect)
}
// TestOneHostFW expects a direct route between two easy-NAT nodes when only
// the second one runs a host firewall.
func TestOneHostFW(t *testing.T) {
	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(easy, easyFW)
	lab.want(routeDirect)
}
// pair selects the two node types that TestPair exercises, written as
// "type1,type2"; names are looked up in types.
var pair = flag.String("pair", "", "comma-separated pair of types to test (easy, easyAF, hard, easyPMP, hardPMP, one2one, sameLAN)")
// TestPair runs the Tailscale connectivity test for the two node types named
// by the --pair flag. It is skipped when the flag does not contain a comma,
// and fails on a type name that is not listed in types. No particular route
// is asserted.
func TestPair(t *testing.T) {
	first, second, ok := strings.Cut(*pair, ",")
	if !ok {
		t.Skipf("skipping test without --pair=type1,type2 set")
	}

	lookup := func(name string) addNodeFunc {
		for i := range types {
			if types[i].name == name {
				return types[i].fn
			}
		}
		t.Fatalf("unknown type %q", name)
		return nil
	}

	lab := newNatTest(t)
	lab.runTailscaleConnectivityTest(lookup(first), lookup(second))
}
// runGrid must be set for TestGrid to run; otherwise it is skipped.
var runGrid = flag.Bool("run-grid", false, "run grid test")
// TestGrid runs the Tailscale connectivity test for every ordered pair of
// entries in types, as parallel subtests gated by a semaphore created with
// size 2. Each result is cached in a "<a>-<b>.cache" file in the current
// directory and reused on later runs (the reversed pair's cache file is
// accepted too). When the test finishes, an HTML table of all results is
// written to grid.html.
//
// The test is skipped unless --run-grid is set.
func TestGrid(t *testing.T) {
if !*runGrid {
t.Skip("skipping grid test; set --run-grid to run")
}
t.Parallel()
sem := syncs.NewSemaphore(2)
// res maps "<a>-<b>" to the observed route; mu guards it across subtests.
var (
mu sync.Mutex
res = make(map[string]pingRoute)
)
for _, a := range types {
for _, b := range types {
key := a.name + "-" + b.name
keyBack := b.name + "-" + a.name
t.Run(key, func(t *testing.T) {
t.Parallel()
sem.Acquire()
defer sem.Release()
// Prefer a cached result for this pair, falling back to the cache
// file of the reversed pair. Read errors are treated as "no cache".
filename := key + ".cache"
contents, _ := os.ReadFile(filename)
if len(contents) == 0 {
filename2 := keyBack + ".cache"
contents, _ = os.ReadFile(filename2)
}
route := pingRoute(strings.TrimSpace(string(contents)))
if route == "" {
// Nothing cached: run the real test and cache its result under key.
nt := newNatTest(t)
route = nt.runTailscaleConnectivityTest(a.fn, b.fn)
if err := os.WriteFile(filename, []byte(string(route)), 0666); err != nil {
t.Fatalf("writeFile: %v", err)
}
}
mu.Lock()
defer mu.Unlock()
res[key] = route
t.Logf("results: %v", res)
})
}
}
// Render the collected results as an HTML table once the test is done.
t.Cleanup(func() {
mu.Lock()
defer mu.Unlock()
var hb bytes.Buffer
pf := func(format string, args ...any) {
fmt.Fprintf(&hb, format, args...)
}
// rewrite shortens type names for display, e.g. "easyPMP" becomes "easy+pm".
rewrite := func(s string) string {
return strings.ReplaceAll(s, "PMP", "+pm")
}
pf("<html><table border=1 cellpadding=5>")
pf("<tr><td></td>")
for _, a := range types {
pf("<td><b>%s</b></td>", rewrite(a.name))
}
pf("</tr>\n")
for _, a := range types {
// sameLAN gets a column but no row of its own.
if a.name == "sameLAN" {
continue
}
pf("<tr><td><b>%s</b></td>", rewrite(a.name))
for _, b := range types {
key := a.name + "-" + b.name
key2 := b.name + "-" + a.name
// Use whichever direction has a result; "-" marks a missing one.
v := cmp.Or(res[key], res[key2], "-")
if v == "derp" {
pf("<td><div style='color: red; font-weight: bold'>%s</div></td>", v)
} else if v == "local" {
pf("<td><div style='color: green; font-weight: bold'>%s</div></td>", v)
} else {
pf("<td>%s</td>", v)
}
}
pf("</tr>\n")
}
pf("</table>")
pf("<b>easy</b>: Endpoint-Independent Mapping, Address and Port-Dependent Filtering (e.g. Linux, Google Wifi, Unifi, eero)<br>")
pf("<b>easyAF</b>: Endpoint-Independent Mapping, Address-Dependent Filtering (James says telephony things or Zyxel type things)<br>")
pf("<b>hard</b>: Address and Port-Dependent Mapping, Address and Port-Dependent Filtering (FreeBSD, OPNSense, pfSense)<br>")
pf("<b>one2one</b>: One-to-One NAT (e.g. an EC2 instance with a public IPv4)<br>")
pf("<b>x+pm</b>: x, with port mapping (NAT-PMP, PCP, UPnP, etc)<br>")
pf("<b>sameLAN</b>: a second node in the same LAN as the first<br>")
pf("</html>")
if err := os.WriteFile("grid.html", hb.Bytes(), 0666); err != nil {
t.Fatalf("writeFile: %v", err)
}
})
}