8203edc099
The label "natlab" is a bit confusing and also used for other things. Instead, change the trigger label to "run-natlab-tests". Updates #13038 Signed-off-by: Claus Lensbøl <claus@tailscale.com>
183 lines
7.4 KiB
YAML
183 lines
7.4 KiB
YAML
# Run the full natlab/vmtest opt-in test suite. These tests boot QEMU VMs
# (gokrazy, Ubuntu, FreeBSD) and exercise vnet-driven networking scenarios.
# They are gated behind --run-vm-tests because they need KVM and are slow.
#
# This workflow runs:
#   - on demand (workflow_dispatch)
#   - on PRs that carry the "run-natlab-tests" label
#   - on main, every 12 hours, via cron
#
# Layout:
#   - "prepare" builds the gokrazy VM image, downloads the cloud images
#     (Ubuntu, FreeBSD), and discovers every Test* function in the two
#     opt-in packages.
#   - "test" is a per-TestFoo matrix that depends on prepare. Each matrix
#     job restores the shared caches and runs a single test. Adding a new
#     TestFoo automatically gets its own job — no workflow edits needed.
#
# A separate workflow (.github/workflows/natlab-basic.yml) runs a single
# canary natlab test on every PR; this one runs the full suite.
|
|
name: "natlab-test"

# At most one run per PR branch: a newer event for the same head_ref cancels
# the run that is still in progress. Events without a head_ref (manual and
# scheduled runs) fall back to the unique run_id, so they get a group of
# their own and are never cancelled by another run.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

# Least privilege: every step in this workflow only reads the repository
# (checkout, caches, apt, go, make), so the GITHUB_TOKEN is restricted to
# read-only access instead of inheriting the repository default.
permissions:
  contents: read

on:
  workflow_dispatch:
  pull_request:
    # "labeled" starts a run when a label is applied to the PR;
    # "synchronize" and "reopened" re-run on new pushes and reopen. The
    # label itself is checked by the prepare job's `if:`.
    types: [labeled, synchronize, reopened]
  schedule:
    # Every 12 hours, off-the-hour to avoid GitHub's :00 cron-stampede window.
    - cron: "23 3,15 * * *"
|
|
|
|
jobs:
  # prepare warms the per-workflow-run caches (gokrazy image, cloud VM
  # images) and emits the dynamic matrix of test names. By doing the work
  # once here, the matrix test jobs never race to rebuild or re-download
  # the same artifacts on a cold cache.
  prepare:
    # Manual and scheduled runs always proceed; pull requests only when they
    # carry the "run-natlab-tests" label.
    if: |
      github.event_name == 'workflow_dispatch' ||
      github.event_name == 'schedule' ||
      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'run-natlab-tests'))
    runs-on: ubuntu-latest
    timeout-minutes: 30
    outputs:
      # JSON array of {pkg, test} objects produced by the "Discover tests"
      # step; the test job expands it into its matrix.
      matrix: ${{ steps.list.outputs.matrix }}
    steps:
      - name: Check out code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      # The cloud VM image cache is keyed only on images.go (image URLs and
      # SHAs), so it survives across workflow runs and is invalidated only
      # when a new image source is added.
      - name: Cache cloud VM images
        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7  # v5.0.4
        with:
          path: ~/.cache/tailscale/vmtest/images
          key: natlab-vmimages-${{ hashFiles('tstest/natlab/vmtest/images.go') }}

      # The gokrazy VM image is keyed by github.sha. That means we rebuild
      # it once per commit but matrix test jobs in the same run all share
      # the result. Per-PR re-runs of the same sha (e.g. a rerun-failed)
      # also get the cache.
      - name: Cache gokrazy VM image
        id: gokrazy-cache
        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7  # v5.0.4
        with:
          path: gokrazy/natlabapp.qcow2
          key: natlab-gokrazy-${{ github.sha }}

      # qemu-utils provides qemu-img, which the gokrazy Makefile uses to
      # convert natlabapp.img to qcow2. Only install if we need it (cache
      # miss); the test matrix jobs install qemu separately for the runtime.
      # NOTE(review): man-db is removed before the install, presumably to
      # skip its slow post-install triggers — confirm.
      - name: Install qemu-utils
        if: steps.gokrazy-cache.outputs.cache-hit != 'true'
        run: |
          sudo rm -f /var/lib/man-db/auto-update
          sudo apt-get -y update
          sudo apt-get -y remove man-db
          sudo apt-get install -y qemu-utils

      - name: Download cloud VM images
        # natlabprep is idempotent: it checks the cache before downloading.
        run: |
          ./tool/go run ./tstest/natlab/vmtest/cmd/natlabprep

      - name: Build gokrazy VM image
        if: steps.gokrazy-cache.outputs.cache-hit != 'true'
        run: |
          make -C gokrazy natlab

      - name: Discover tests
        id: list
        # Grep the test files directly rather than invoking `go test -list`
        # so we don't pay the cost of compiling the test binaries here. The
        # only test functions in these packages use the canonical
        # `func TestFoo(t *testing.T)` signature.
        #
        # exclude is the set of tests that need special invocation
        # (extra flags, a specific environment) and don't fit the
        # single-test-per-matrix-job model. They stay runnable locally.
        run: |
          set -euo pipefail
          exclude='^(TestGrid)$'
          tmp=$(mktemp)
          for pkg_dir in tstest/natlab/vmtest tstest/integration/nat; do
            pkg="./${pkg_dir}/"
            for f in "${pkg_dir}"/*_test.go; do
              [ -e "$f" ] || continue
              # grep exits 1 when it selects no lines. Under errexit +
              # pipefail that would abort discovery for any _test.go file
              # with no matching Test function, or whose only tests are
              # excluded. Accept status 1 from either grep, but still fail
              # on real errors (status >= 2).
              { grep -hE '^func Test[A-Z][A-Za-z0-9_]*\(t \*testing\.T\)' "$f" || [ "$?" -eq 1 ]; } \
                | sed -E 's/^func (Test[A-Za-z0-9_]+).*/\1/' \
                | { grep -vE "$exclude" || [ "$?" -eq 1 ]; } \
                | while read -r t; do
                    jq -nc --arg pkg "$pkg" --arg test "$t" \
                      '{pkg: $pkg, test: $test}' >> "$tmp"
                  done
            done
          done
          # Fail here with a clear message rather than handing the test job
          # an empty matrix.
          if [ ! -s "$tmp" ]; then
            echo "::error::no Test functions discovered; refusing to emit an empty matrix" >&2
            exit 1
          fi
          matrix=$(jq -s -c . "$tmp")
          echo "matrix=${matrix}" >> "$GITHUB_OUTPUT"
          echo "Discovered tests:"
          jq . "$tmp"

  # test fans out into one job per discovered {pkg, test} pair. Each job
  # restores the caches written by prepare and runs exactly one test.
  test:
    needs: prepare
    runs-on: ubuntu-latest
    timeout-minutes: 20
    name: "${{ matrix.test }}"
    strategy:
      # One failing test must not cancel the remaining matrix jobs.
      fail-fast: false
      matrix:
        include: ${{ fromJson(needs.prepare.outputs.matrix) }}
    steps:
      - name: Check out code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      # Install a udev rule that sets the kvm device node to mode 0666,
      # then reload the rules and re-trigger the kvm device so the rule
      # takes effect in this job.
      - name: Enable KVM
        run: |
          echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules
          sudo udevadm control --reload-rules
          sudo udevadm trigger --name-match=kvm

      - name: Install qemu
        run: |
          sudo rm -f /var/lib/man-db/auto-update
          sudo apt-get -y update
          sudo apt-get -y remove man-db
          sudo apt-get install -y qemu-system-x86 qemu-utils

      # restore-only: prepare is the single writer of these caches, so
      # matrix jobs don't write back. fail-on-cache-miss would be too
      # strict for the gokrazy cache (e.g. a non-fatal cache eviction
      # between prepare and us); we just rebuild on miss instead.
      # NOTE(review): no step in this job rebuilds the image, so the
      # rebuild presumably happens inside the test harness — confirm.
      - name: Restore cloud VM images
        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7  # v5.0.4
        with:
          path: ~/.cache/tailscale/vmtest/images
          key: natlab-vmimages-${{ hashFiles('tstest/natlab/vmtest/images.go') }}

      - name: Restore gokrazy VM image
        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7  # v5.0.4
        with:
          path: gokrazy/natlabapp.qcow2
          key: natlab-gokrazy-${{ github.sha }}

      # The gokrazy-based tests boot the kernel directly from
      # vmlinuz that ships in the tailscale/gokrazy-kernel module.
      # Tests look it up under GOMODCACHE via findKernelPath, so the
      # module has to be present even though no Go source imports it
      # in the test package itself.
      - name: Download gokrazy-kernel module
        run: |
          ./tool/go mod download github.com/tailscale/gokrazy-kernel

      - name: Run ${{ matrix.test }}
        # Per-test timeout is well above the few-minute typical runtime
        # but small enough that a stuck test fails fast instead of holding
        # the runner for the job's 20-minute budget.
        run: |
          ./tool/go test -v -timeout=15m -count=1 ${{ matrix.pkg }} \
            -run='^${{ matrix.test }}$' --run-vm-tests
|