Files
tailscale/tstest/integration/nat/nat_test.go
T
Claus Lensbøl bb47ea2c6b tstest/natlab/vmtest: start migrating old natlab tests to vmtest (#19727)
Instead of having two entry points for running natlab tests, start
converting the connectivity tests to use the vmtest framework.

Grid and pair tests have yet to be moved over.

Updates #13038

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
2026-05-13 16:44:53 -04:00

700 lines
18 KiB
Go

// Copyright (c) Tailscale Inc & contributors
// SPDX-License-Identifier: BSD-3-Clause
package nat
import (
"bytes"
"cmp"
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
"net"
"net/http"
"net/netip"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"sync"
"testing"
"time"
"golang.org/x/mod/modfile"
"golang.org/x/sync/errgroup"
"tailscale.com/client/tailscale"
"tailscale.com/ipn/ipnstate"
"tailscale.com/syncs"
"tailscale.com/tailcfg"
"tailscale.com/tstest/natlab/vnet"
)
// Command-line flags that control how the VM-backed NAT tests run.
var (
// runVMTests gates the heavy VM tests; newNatTest skips the test unless it is set.
runVMTests = flag.Bool("run-vm-tests", false, "run tests that require a VM")
// logTailscaled makes setupTest enable verbose syslog on every node it creates.
logTailscaled = flag.Bool("log-tailscaled", false, "log tailscaled output")
// pcapFile is passed to vnet.Config.SetPCAPFile by setupTest.
pcapFile = flag.String("pcap", "", "write pcap to file")
)
// natTest carries the state shared by the helpers of a single NAT
// connectivity test. It is created by newNatTest.
type natTest struct {
tb testing.TB // test that owns this natTest; used for logging, skipping and cleanup
base string // base image that each node's qcow2 disk is backed by
tempDir string // for qcow2 images, and the QEMU/QMP unix sockets
vnet *vnet.Server // virtual network server; assigned by setupTest
kernel string // linux kernel path, passed to QEMU via -kernel
gotRoute pingRoute // route classification recorded by runTailscaleConnectivityTest
}
// newNatTest builds the natTest used by a VM-backed test.
//
// It resolves the module root relative to the working directory, skips the
// test unless --run-vm-tests is set, builds the gokrazy VM image with
// "make natlab" when it does not exist yet, and locates the kernel through
// findKernelPath. A missing image after the build, or a missing kernel,
// skips the test; any other failure is fatal.
func newNatTest(tb testing.TB) *natTest {
	cwd, err := os.Getwd()
	if err != nil {
		tb.Fatal(err)
	}
	modRoot := filepath.Join(cwd, "../../..")
	gokrazyDir := filepath.Join(modRoot, "gokrazy")

	nt := &natTest{
		tb:      tb,
		tempDir: tb.TempDir(),
		base:    filepath.Join(gokrazyDir, "natlabapp.qcow2"),
	}
	if !*runVMTests {
		tb.Skip("skipping heavy test; set --run-vm-tests to run")
	}

	switch _, statErr := os.Stat(nt.base); {
	case statErr == nil:
		// The base image is already present; nothing to build.
	case !os.IsNotExist(statErr):
		tb.Fatal(statErr)
	default:
		tb.Logf("building VM image...")
		build := exec.Command("make", "natlab")
		build.Dir = gokrazyDir
		build.Stdout = os.Stdout
		build.Stderr = os.Stderr
		if err := build.Run(); err != nil {
			tb.Fatalf("Error running 'make natlab' in gokrazy directory: %v", err)
		}
		if _, err := os.Stat(nt.base); err != nil {
			tb.Skipf("still can't find VM image: %v", err)
		}
	}

	nt.kernel, err = findKernelPath(filepath.Join(modRoot, "go.mod"))
	if err != nil {
		tb.Skipf("skipping test; kernel not found: %v", err)
	}
	tb.Logf("found kernel: %v", nt.kernel)
	return nt
}
// findKernelPath returns the path of the vmlinuz file inside the Go module
// cache for the github.com/tailscale/gokrazy-kernel version that the go.mod
// file at goMod requires.
//
// It returns an error if goMod cannot be read or parsed, if it does not
// require the kernel module, or if "go env GOMODCACHE" fails.
func findKernelPath(goMod string) (string, error) {
	const kernelModule = "github.com/tailscale/gokrazy-kernel"

	b, err := os.ReadFile(goMod)
	if err != nil {
		return "", err
	}
	mf, err := modfile.Parse("go.mod", b, nil)
	if err != nil {
		return "", err
	}
	for _, r := range mf.Require {
		if r.Mod.Path != kernelModule {
			continue
		}
		// Only shell out once the requirement is known to exist. Output is
		// used rather than CombinedOutput so that anything the go tool
		// prints on stderr cannot end up inside the returned path.
		out, err := exec.Command("go", "env", "GOMODCACHE").Output()
		if err != nil {
			return "", fmt.Errorf("go env GOMODCACHE: %w", err)
		}
		modCache := strings.TrimSpace(string(out))
		// r.Mod.String() is "path@version", the directory name used for the
		// module below the cache root.
		return filepath.Join(modCache, r.Mod.String(), "vmlinuz"), nil
	}
	return "", fmt.Errorf("failed to find kernel in %v", goMod)
}
// addNodeFunc adds one node to the vnet configuration c and returns it.
// setupTest skips the test when the returned node is nil.
type addNodeFunc func(c *vnet.Config) *vnet.Node // returns nil to omit test
// easy adds a node on a new network using vnet.EasyNAT. With n being the
// node's 1-based ordinal, the network gets public IP 2.n.n.n and LAN
// 192.168.n.1/24.
func easy(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)
	network := c.AddNetwork(wanIP, lanCIDR, vnet.EasyNAT)
	return c.AddNode(network)
}
// easyAF adds a node on a new network using vnet.EasyAFNAT. With n being the
// node's 1-based ordinal, the network gets public IP 2.n.n.n and LAN
// 192.168.n.1/24.
func easyAF(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)
	network := c.AddNetwork(wanIP, lanCIDR, vnet.EasyAFNAT)
	return c.AddNode(network)
}
// sameLAN adds a node to the first network already present in c. It returns
// nil when there is no first network or when that network cannot take more
// nodes.
func sameLAN(c *vnet.Config) *vnet.Node {
	if first := c.FirstNetwork(); first != nil && first.CanTakeMoreNodes() {
		return c.AddNode(first)
	}
	return nil
}
// one2one adds a node on a new network using vnet.One2OneNAT. With n being
// the node's 1-based ordinal, the network gets public IP 2.n.n.n and LAN
// 172.16.n.1/24.
func one2one(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("172.16.%d.1/24", idx)
	network := c.AddNetwork(wanIP, lanCIDR, vnet.One2OneNAT)
	return c.AddNode(network)
}
// easyPMP adds a node on a new network using vnet.EasyNAT together with
// vnet.NATPMP. With n being the node's 1-based ordinal, the network gets
// public IP 2.n.n.n and LAN 192.168.n.1/24.
func easyPMP(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("192.168.%d.1/24", idx)
	network := c.AddNetwork(wanIP, lanCIDR, vnet.EasyNAT, vnet.NATPMP)
	return c.AddNode(network)
}
// hard adds a node on a new network using vnet.HardNAT. With n being the
// node's 1-based ordinal, the network gets public IP 2.n.n.n and LAN
// 10.0.n.1/24.
func hard(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("10.0.%d.1/24", idx)
	network := c.AddNetwork(wanIP, lanCIDR, vnet.HardNAT)
	return c.AddNode(network)
}
// hardPMP adds a node on a new network using vnet.HardNAT together with
// vnet.NATPMP. With n being the node's 1-based ordinal, the network gets
// public IP 2.n.n.n and LAN 10.7.n.1/24.
func hardPMP(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	lanCIDR := fmt.Sprintf("10.7.%d.1/24", idx)
	network := c.AddNetwork(wanIP, lanCIDR, vnet.HardNAT, vnet.NATPMP)
	return c.AddNode(network)
}
// setupTest builds the virtual network described by the one or two addNode
// funcs, boots one QEMU VM per node, waits for each VM to send its first
// packet, and then brings every node that should join the tailnet up.
//
// It returns the created nodes, one node agent client per node (same order),
// and a cleanup func (nt.vnet.Close). The test is skipped when any addNode
// func returns nil; every other failure is fatal to the test.
func (nt *natTest) setupTest(ctx context.Context, addNode ...addNodeFunc) (nodes []*vnet.Node, clients []*vnet.NodeAgentClient, cleanup func()) {
if len(addNode) < 1 || len(addNode) > 2 {
// NOTE(review): the message says "runTest" although this is setupTest.
nt.tb.Fatalf("runTest: invalid number of nodes %v; want 1 or 2", len(addNode))
}
t := nt.tb
// Describe the network: every addNode func contributes one node.
var c vnet.Config
c.SetPCAPFile(*pcapFile)
for _, fn := range addNode {
node := fn(&c)
if node == nil {
t.Skip("skipping test; not applicable combination")
}
nodes = append(nodes, node)
if *logTailscaled {
node.SetVerboseSyslog(true)
}
}
var err error
nt.vnet, err = vnet.New(&c)
if err != nil {
t.Fatalf("newServer: %v", err)
}
// NOTE(review): nt.vnet.Close is registered here and also returned as the
// cleanup func below, so callers that run cleanup close the server twice;
// presumably Close tolerates that — confirm.
nt.tb.Cleanup(func() {
nt.vnet.Close()
})
// Deferred calls run in reverse order: the listener is closed first, which
// makes Accept fail and ends the goroutine that wg.Wait then waits for.
var wg sync.WaitGroup // waiting for srv.Accept goroutine
defer wg.Wait()
// Unix socket that each QEMU instance's network device connects to.
sockAddr := filepath.Join(nt.tempDir, "qemu.sock")
srv, err := net.Listen("unix", sockAddr)
if err != nil {
t.Fatalf("Listen: %v", err)
}
defer srv.Close()
wg.Go(func() {
for {
c, err := srv.Accept()
if err != nil {
return
}
// Hand every accepted connection to the vnet server.
go nt.vnet.ServeUnixConn(c.(*net.UnixConn), vnet.ProtocolQEMU)
}
})
// KVM is used only on Linux and only when /dev/kvm can be opened read-write.
haveKVM := false
if runtime.GOOS == "linux" {
if f, err := os.OpenFile("/dev/kvm", os.O_RDWR, 0); err == nil {
f.Close()
haveKVM = true
}
}
// Launch one VM per node.
qmpSocks := make([]string, len(nodes))
for i, node := range nodes {
// Per-node qcow2 disk backed by the shared base image.
disk := fmt.Sprintf("%s/node-%d.qcow2", nt.tempDir, i)
out, err := exec.Command("qemu-img", "create",
"-f", "qcow2",
"-F", "qcow2",
"-b", nt.base,
disk).CombinedOutput()
if err != nil {
t.Fatalf("qemu-img create: %v, %s", err, out)
}
// Extra kernel command-line arguments: the node's environment, plus a
// nameserver and an IPv6 syslog target for IPv6-only nodes.
var envBuf bytes.Buffer
for _, e := range node.Env() {
fmt.Fprintf(&envBuf, " tailscaled.env=%s=%s", e.Key, e.Value)
}
sysLogAddr := net.JoinHostPort(vnet.FakeSyslogIPv4().String(), "995")
if node.IsV6Only() {
fmt.Fprintf(&envBuf, " tta.nameserver=%s", vnet.FakeDNSIPv6())
sysLogAddr = net.JoinHostPort(vnet.FakeSyslogIPv6().String(), "995")
}
envStr := envBuf.String()
// Per-node QMP socket, queried by qmpQueryStatus if the VM never boots.
qmpSocks[i] = fmt.Sprintf("%s/qmp-node-%d.sock", nt.tempDir, i)
qemuArgs := []string{
"-M", "microvm,isa-serial=off",
"-m", "384M",
"-nodefaults", "-no-user-config", "-nographic",
"-kernel", nt.kernel,
"-append", "console=hvc0 root=PARTUUID=60c24cc1-f3f9-427a-8199-76baa2d60001/PARTNROFF=1 ro init=/gokrazy/init panic=10 oops=panic pci=off nousb gokrazy.remote_syslog.target=" + sysLogAddr + " tailscale-tta=1" + envStr,
"-drive", "id=blk0,file=" + disk + ",format=qcow2",
"-device", "virtio-blk-device,drive=blk0",
"-netdev", "stream,id=net0,addr.type=unix,addr.path=" + sockAddr,
"-device", "virtio-serial-device",
"-device", "virtio-rng-device",
"-device", "virtio-net-device,netdev=net0,mac=" + node.MAC().String(),
"-chardev", "stdio,id=virtiocon0,mux=on",
"-device", "virtconsole,chardev=virtiocon0",
"-mon", "chardev=virtiocon0,mode=readline",
"-qmp", "unix:" + qmpSocks[i] + ",server=on,wait=off",
}
if haveKVM {
qemuArgs = append(qemuArgs, "-enable-kvm", "-cpu", "host")
}
cmd := exec.Command("qemu-system-x86_64", qemuArgs...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Start(); err != nil {
t.Fatalf("qemu: %v", err)
}
// Kill and reap the VM when the test finishes.
nt.tb.Cleanup(func() {
cmd.Process.Kill()
cmd.Wait()
})
}
// Wait for every VM to show boot progress (its first packet on the vnet).
for i, node := range nodes {
if err := nt.vnet.AwaitFirstPacket(ctx, node.MAC()); err != nil {
t.Logf("node %v: no boot progress (no packets received): %v", node, err)
t.Logf("node %v: QMP status: %s", node, qmpQueryStatus(qmpSocks[i]))
t.FailNow()
}
t.Logf("node %v: boot detected (first packet received)", node)
}
// Create one agent client per node, in node order.
for _, n := range nodes {
client := nt.vnet.NodeAgentClient(n)
n.SetClient(client)
clients = append(clients, client)
}
// Bring all nodes up concurrently. Failures are returned from the
// goroutines and reported once by the t.Fatalf after eg.Wait.
var eg errgroup.Group
for i, c := range clients {
eg.Go(func() error {
node := nodes[i]
t.Logf("%v calling Status...", node)
st, err := c.Status(ctx)
if err != nil {
return fmt.Errorf("%v status: %w", node, err)
}
t.Logf("%v status: %v", node, st.BackendState)
if node.HostFirewall() {
if err := c.EnableHostFirewall(ctx); err != nil {
return fmt.Errorf("%v firewall: %w", node, err)
}
t.Logf("%v firewalled", node)
}
if node.ShouldJoinTailnet() {
if err := up(ctx, c); err != nil {
return fmt.Errorf("%v up: %w", node, err)
}
t.Logf("%v up!", node)
// Fetch the status again now that the node has been brought up.
st, err = c.Status(ctx)
if err != nil {
return fmt.Errorf("%v status: %w", node, err)
}
if capMap := node.WantCapMap(); capMap != nil {
nt.tb.Logf("using capmap for %s: %+v", node.String(), capMap)
nt.vnet.ControlServer().SetNodeCapMap(st.Self.PublicKey, capMap)
}
if st.BackendState != "Running" {
return fmt.Errorf("%v state = %q", node, st.BackendState)
}
t.Logf("%v AllowedIPs: %v", node, st.Self.Addrs)
t.Logf("%v up with %v", node, st.Self.TailscaleIPs)
} else {
t.Logf("%v skipping joining tailnet", node)
}
return nil
})
}
if err := eg.Wait(); err != nil {
t.Fatalf("initial setup: %v", err)
}
return nodes, clients, nt.vnet.Close
}
// hasDeadline is implemented by test values that can report the deadline
// imposed on the test binary, such as *testing.T.
type hasDeadline interface {
	Deadline() (deadline time.Time, ok bool)
}

// testContext returns a context that expires shortly before the test's own
// deadline (from -timeout), so there is a small margin left for cleanup.
// When tb cannot report a deadline, or none is set, the context times out
// after 60s instead.
func testContext(tb testing.TB) (context.Context, context.CancelFunc) {
	const (
		cleanupMargin   = 5 * time.Second
		fallbackTimeout = 60 * time.Second
	)
	background := context.Background()

	withDeadline, ok := tb.(hasDeadline)
	if !ok {
		return context.WithTimeout(background, fallbackTimeout)
	}
	testDeadline, ok := withDeadline.Deadline()
	if !ok {
		return context.WithTimeout(background, fallbackTimeout)
	}
	return context.WithDeadline(background, testDeadline.Add(-cleanupMargin))
}
// runTailscaleConnectivityTest sets up the nodes described by addNode, pings
// the second node from the first, and returns how that ping was routed. The
// result is also stored in nt.gotRoute.
//
// It returns "" without pinging when fewer than two nodes were created or
// when any node did not join the tailnet.
func (nt *natTest) runTailscaleConnectivityTest(addNode ...addNodeFunc) pingRoute {
	ctx, cancel := testContext(nt.tb)
	defer cancel()

	nodes, clients, cleanup := nt.setupTest(ctx, addNode...)
	defer cleanup()

	t := nt.tb
	if len(nodes) < 2 {
		return ""
	}
	for _, node := range nodes {
		if node.ShouldJoinTailnet() {
			continue
		}
		t.Logf("%v did not join tailnet", node)
		return ""
	}

	// Fetch every node's status concurrently; statuses[i] belongs to nodes[i].
	statuses := make([]*ipnstate.Status, len(nodes))
	var fetch errgroup.Group
	for idx, client := range clients {
		fetch.Go(func() error {
			st, err := client.Status(ctx)
			if err != nil {
				return fmt.Errorf("%v: %w", nodes[idx], err)
			}
			statuses[idx] = st
			return nil
		})
	}
	if err := fetch.Wait(); err != nil {
		t.Fatalf("get node statuses: %v", err)
	}

	icmpFirst := false
	for _, node := range nodes {
		node.Network().PostConnectedToControl()
		if err := node.PostConnectedToControl(ctx); err != nil {
			t.Fatalf("post control error: %s", err)
		}
		icmpFirst = node.PreICMPPing() || icmpFirst
	}

	peerIP := statuses[1].Self.TailscaleIPs[0]

	// Should we send traffic across the nodes before starting disco?
	// For nodes that rotated disco keys after control going away.
	if icmpFirst {
		if _, err := ping(ctx, t, clients[0], peerIP, tailcfg.PingICMP); err != nil {
			t.Fatalf("ICMP ping failure: %v", err)
		}
	}

	// A failed disco ping is only logged; classifyPing turns the nil result
	// into routeNil.
	result, err := ping(ctx, t, clients[0], peerIP, tailcfg.PingDisco)
	if err != nil {
		t.Logf("ping failure: %v", err)
	}
	route := classifyPing(result)
	nt.gotRoute = route
	t.Logf("ping route: %v", route)
	return route
}
// classifyPing maps a ping result to the route it used: routeNil for a nil
// result, routeLocal when the endpoint parses and has a private address,
// routeDirect when it parses and does not, and routeDERP when the endpoint
// is empty or cannot be parsed.
func classifyPing(pr *ipnstate.PingResult) pingRoute {
	switch {
	case pr == nil:
		return routeNil
	case pr.Endpoint == "":
		return routeDERP // presumably
	}
	endpoint, err := netip.ParseAddrPort(pr.Endpoint)
	if err != nil {
		return routeDERP // presumably
	}
	if endpoint.Addr().IsPrivate() {
		return routeLocal
	}
	return routeDirect
}
// pingRoute names the kind of path a ping between two nodes took, as
// determined by classifyPing.
type pingRoute string
const (
routeDERP pingRoute = "derp" // no usable endpoint in the ping result
routeLocal pingRoute = "local" // endpoint with a private address
routeDirect pingRoute = "direct" // endpoint with a non-private address
routeNil pingRoute = "nil" // *ipnstate.PingResult is nil
)
// ping pings target through c, making up to 10 attempts of at most 2s each.
//
// A result with DERPRegionID 0 is returned immediately. A result with a
// non-zero DERPRegionID is remembered and the ping is retried after one
// second; the last such result is returned if no better one arrives or ctx
// is done. A result carrying an error string fails right away with that
// error. If no attempt produced a result, an error is returned.
func ping(ctx context.Context, t testing.TB, c *vnet.NodeAgentClient, target netip.Addr, pType tailcfg.PingType) (*ipnstate.PingResult, error) {
	const (
		maxAttempts    = 10
		attemptTimeout = 2 * time.Second
		retryDelay     = time.Second
	)
	var viaDERP *ipnstate.PingResult

attempts:
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		t.Logf("ping attempt %d to %v ...", attempt, target)
		attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout)
		pr, err := c.PingWithOpts(attemptCtx, target, pType, tailscale.PingOpts{})
		cancel()

		switch {
		case err != nil:
			t.Logf("ping attempt %d error: %v", attempt, err)
			if ctx.Err() != nil {
				// The overall context is done; stop retrying.
				break attempts
			}
			continue
		case pr.Err != "":
			return nil, errors.New(pr.Err)
		}

		t.Logf("ping attempt %d: derp=%d endpoint=%v latency=%v", attempt, pr.DERPRegionID, pr.Endpoint, pr.LatencySeconds)
		if pr.DERPRegionID == 0 {
			return pr, nil
		}
		viaDERP = pr
		select {
		case <-ctx.Done():
			return viaDERP, nil
		case <-time.After(retryDelay):
		}
	}

	if viaDERP == nil {
		return nil, fmt.Errorf("no ping response (ctx: %v)", ctx.Err())
	}
	return viaDERP, nil
}
// qmpQueryStatus asks the QEMU instance behind the QMP unix socket at
// sockPath for its VM status. It returns a human-readable string: either
// "status=<status> running=<bool>" (e.g. status "running", "paused",
// "prelaunch") or a description of the step that failed. It never returns an
// error value because the result is only used for logging.
func qmpQueryStatus(sockPath string) string {
	qmp, err := net.DialTimeout("unix", sockPath, 2*time.Second)
	if err != nil {
		return fmt.Sprintf("dial error: %v", err)
	}
	defer qmp.Close()
	// Bound the whole exchange so a wedged QEMU cannot hang the test.
	qmp.SetDeadline(time.Now().Add(5 * time.Second))

	replies := json.NewDecoder(qmp)
	// execute sends a single argument-less QMP command.
	execute := func(command string) error {
		_, err := qmp.Write([]byte(`{"execute":"` + command + `"}` + "\n"))
		return err
	}

	// The server speaks first: consume its greeting.
	var discard json.RawMessage
	if err := replies.Decode(&discard); err != nil {
		return fmt.Sprintf("greeting error: %v", err)
	}

	// Negotiate capabilities to enter command mode.
	if err := execute("qmp_capabilities"); err != nil {
		return fmt.Sprintf("write caps: %v", err)
	}
	if err := replies.Decode(&discard); err != nil {
		return fmt.Sprintf("caps response: %v", err)
	}

	// Ask for the VM status.
	if err := execute("query-status"); err != nil {
		return fmt.Sprintf("write query-status: %v", err)
	}
	var reply struct {
		Return struct {
			Running bool   `json:"running"`
			Status  string `json:"status"`
		} `json:"return"`
		Error *struct {
			Class string `json:"class"`
			Desc  string `json:"desc"`
		} `json:"error"`
	}
	if err := replies.Decode(&reply); err != nil {
		return fmt.Sprintf("status response: %v", err)
	}
	if qerr := reply.Error; qerr != nil {
		return fmt.Sprintf("qmp error: %s: %s", qerr.Class, qerr.Desc)
	}
	return fmt.Sprintf("status=%s running=%v", reply.Return.Status, reply.Return.Running)
}
// up issues GET /up through the node agent client's HTTP client and returns
// an error unless the response status is 200. The response body is included
// in the error message.
func up(ctx context.Context, c *vnet.NodeAgentClient) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://unused/up", nil)
	if err != nil {
		return err
	}
	resp, err := c.HTTPClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// The body is only used for the error message, so a read error is ignored.
	body, _ := io.ReadAll(resp.Body)
	if resp.StatusCode == http.StatusOK {
		return nil
	}
	return fmt.Errorf("unexpected status code %v: %s", resp.Status, body)
}
// nodeType pairs a node type's name with the func that adds such a node to
// a vnet configuration.
type nodeType struct {
name string // name used by --pair and in TestGrid subtest names and cache files
fn addNodeFunc // adds a node of this type
}
// types lists every node type known to TestPair and TestGrid. TestGrid
// iterates it in this order for both the subtests and the HTML table.
var types = []nodeType{
{"easy", easy},
{"easyAF", easyAF},
{"hard", hard},
{"easyPMP", easyPMP},
{"hardPMP", hardPMP},
{"one2one", one2one},
{"sameLAN", sameLAN},
{"cgnat", cgnatNoTailnet},
}
// cgnatNoTailnet adds a node marked vnet.DontJoinTailnet on a new network
// using vnet.EasyNAT. With n being the node's 1-based ordinal, the network
// gets LAN 100.65.n.1/16 and public IP 2.n.n.n.
func cgnatNoTailnet(c *vnet.Config) *vnet.Node {
	idx := c.NumNodes() + 1
	lanCIDR := fmt.Sprintf("100.65.%d.1/16", idx)
	wanIP := fmt.Sprintf("2.%d.%d.%d", idx, idx, idx) // public IP
	network := c.AddNetwork(lanCIDR, wanIP, vnet.EasyNAT)
	return c.AddNode(network, vnet.DontJoinTailnet)
}
// pair selects the two node types that TestPair connects. The help text
// lists every name present in types.
var pair = flag.String("pair", "", "comma-separated pair of types to test (easy, easyAF, hard, easyPMP, hardPMP, one2one, sameLAN, cgnat)")

// TestPair runs the connectivity test for the single pair of node types
// given by --pair=type1,type2. It is skipped when --pair is unset or has no
// comma, and fails on a type name that is not in types.
func TestPair(t *testing.T) {
	t1, t2, ok := strings.Cut(*pair, ",")
	if !ok {
		t.Skipf("skipping test without --pair=type1,type2 set")
	}
	find := func(name string) addNodeFunc {
		// Tolerate whitespace around the names, e.g. --pair="easy, hard".
		name = strings.TrimSpace(name)
		for _, nt := range types {
			if nt.name == name {
				return nt.fn
			}
		}
		t.Fatalf("unknown type %q", name)
		return nil
	}
	nt := newNatTest(t)
	nt.runTailscaleConnectivityTest(find(t1), find(t2))
}
// runGrid gates TestGrid, which is skipped unless the flag is set.
var runGrid = flag.Bool("run-grid", false, "run grid test")

// TestGrid runs the connectivity test for every ordered pair of node types,
// at most two at a time, and writes the resulting routes as an HTML table to
// grid.html.
//
// Each pair's route is cached in "<a>-<b>.cache" in the working directory. A
// non-empty cache file for the pair, or for the reversed pair, is used
// instead of running the test again.
func TestGrid(t *testing.T) {
	if !*runGrid {
		t.Skip("skipping grid test; set --run-grid to run")
	}
	t.Parallel()

	limit := syncs.NewSemaphore(2)
	var (
		mu      sync.Mutex
		results = make(map[string]pingRoute)
	)

	for _, first := range types {
		for _, second := range types {
			key := first.name + "-" + second.name
			reverseKey := second.name + "-" + first.name
			t.Run(key, func(t *testing.T) {
				t.Parallel()
				limit.Acquire()
				defer limit.Release()

				cacheFile := key + ".cache"
				cached, _ := os.ReadFile(cacheFile)
				if len(cached) == 0 {
					// Fall back to the result cached for the reversed pair.
					cached, _ = os.ReadFile(reverseKey + ".cache")
				}

				route := pingRoute(strings.TrimSpace(string(cached)))
				if route == "" {
					route = newNatTest(t).runTailscaleConnectivityTest(first.fn, second.fn)
					if err := os.WriteFile(cacheFile, []byte(route), 0666); err != nil {
						t.Fatalf("writeFile: %v", err)
					}
				}

				mu.Lock()
				defer mu.Unlock()
				results[key] = route
				t.Logf("results: %v", results)
			})
		}
	}

	// Render the table once the function and its subtests are done.
	t.Cleanup(func() {
		mu.Lock()
		defer mu.Unlock()

		var page bytes.Buffer
		// label shortens the "PMP" suffix for display.
		label := func(name string) string {
			return strings.ReplaceAll(name, "PMP", "+pm")
		}

		page.WriteString("<html><table border=1 cellpadding=5>")
		page.WriteString("<tr><td></td>")
		for _, col := range types {
			fmt.Fprintf(&page, "<td><b>%s</b></td>", label(col.name))
		}
		page.WriteString("</tr>\n")

		for _, row := range types {
			if row.name == "sameLAN" {
				continue
			}
			fmt.Fprintf(&page, "<tr><td><b>%s</b></td>", label(row.name))
			for _, col := range types {
				forward := row.name + "-" + col.name
				backward := col.name + "-" + row.name
				route := cmp.Or(results[forward], results[backward], "-")
				switch route {
				case routeDERP:
					fmt.Fprintf(&page, "<td><div style='color: red; font-weight: bold'>%s</div></td>", route)
				case routeLocal:
					fmt.Fprintf(&page, "<td><div style='color: green; font-weight: bold'>%s</div></td>", route)
				default:
					fmt.Fprintf(&page, "<td>%s</td>", route)
				}
			}
			page.WriteString("</tr>\n")
		}
		page.WriteString("</table>")

		// Legend explaining the node type names.
		page.WriteString("<b>easy</b>: Endpoint-Independent Mapping, Address and Port-Dependent Filtering (e.g. Linux, Google Wifi, Unifi, eero)<br>")
		page.WriteString("<b>easyAF</b>: Endpoint-Independent Mapping, Address-Dependent Filtering (James says telephony things or Zyxel type things)<br>")
		page.WriteString("<b>hard</b>: Address and Port-Dependent Mapping, Address and Port-Dependent Filtering (FreeBSD, OPNSense, pfSense)<br>")
		page.WriteString("<b>one2one</b>: One-to-One NAT (e.g. an EC2 instance with a public IPv4)<br>")
		page.WriteString("<b>x+pm</b>: x, with port mapping (NAT-PMP, PCP, UPnP, etc)<br>")
		page.WriteString("<b>sameLAN</b>: a second node in the same LAN as the first<br>")
		page.WriteString("</html>")

		if err := os.WriteFile("grid.html", page.Bytes(), 0666); err != nil {
			t.Fatalf("writeFile: %v", err)
		}
	})
}