net/{batching,udprelay},wgengine/magicsock: add SO_RXQ_OVFL clientmetrics

For the purpose of improved observability of UDP socket receive buffer
overflows on Linux.

Updates tailscale/corp#37679

Signed-off-by: Jordan Whited <jordan@tailscale.com>
This commit is contained in:
Jordan Whited
2026-03-11 13:02:09 -07:00
committed by Jordan Whited
parent 660a4608d2
commit 96dde53b43
5 changed files with 330 additions and 53 deletions
+147 -25
View File
@@ -24,6 +24,7 @@ import (
"tailscale.com/net/neterror"
"tailscale.com/net/packet"
"tailscale.com/types/nettype"
"tailscale.com/util/clientmetric"
)
// xnetBatchReaderWriter defines the batching i/o methods of
@@ -58,6 +59,14 @@ type linuxBatchingConn struct {
setGSOSizeInControl func(control *[]byte, gsoSize uint16) // typically setGSOSizeInControl(); swappable for testing
getGSOSizeFromControl func(control []byte) (int, error) // typically getGSOSizeFromControl(); swappable for testing
sendBatchPool sync.Pool
rxqOverflowsMetric *clientmetric.Metric
// readOpMu guards read operations that must perform accounting against
// rxqOverflows in single-threaded fashion. There are no concurrent usages
// of read operations at the time of writing (2026-03-09), but it would be
// unidiomatic to push this responsibility onto callers.
readOpMu sync.Mutex
rxqOverflows uint32 // kernel pumps a cumulative counter, which we track to push a clientmetric delta value
}
func (c *linuxBatchingConn) ReadFromUDPAddrPort(p []byte) (n int, addr netip.AddrPort, err error) {
@@ -285,16 +294,87 @@ func (c *linuxBatchingConn) splitCoalescedMessages(msgs []ipv6.Message, firstMsg
return n, nil
}
// getDataFromControl returns the data portion of the first control msg with
// matching cmsgLevel, matching cmsgType, and min data len of minDataLen, in
// control. If no matching cmsg is found or the len(control) < unix.SizeofCmsghdr,
// this function returns nil data. A non-nil error will be returned if
// len(control) > unix.SizeofCmsghdr but its contents cannot be parsed as a
// socket control message.
func getDataFromControl(control []byte, cmsgLevel, cmsgType int32, minDataLen int) ([]byte, error) {
var (
hdr unix.Cmsghdr
data []byte
rem = control
err error
)
for len(rem) > unix.SizeofCmsghdr {
hdr, data, rem, err = unix.ParseOneSocketControlMessage(rem)
if err != nil {
return nil, fmt.Errorf("error parsing socket control message: %w", err)
}
if hdr.Level == cmsgLevel && hdr.Type == cmsgType && len(data) >= minDataLen {
return data, nil
}
}
return nil, nil
}
// getRXQOverflowsFromControl returns the rxq overflows cumulative counter found
// in control. If no rxq counter is found or the len(control) < unix.SizeofCmsghdr,
// this function returns 0. A non-nil error will be returned if control is
// malformed.
func getRXQOverflowsFromControl(control []byte) (uint32, error) {
data, err := getDataFromControl(control, unix.SOL_SOCKET, unix.SO_RXQ_OVFL, 4)
if err != nil {
return 0, err
}
if len(data) >= 4 {
return binary.NativeEndian.Uint32(data), nil
}
return 0, nil
}
// handleRXQOverflowCounter handles any rx queue overflow counter contained in
// the tail of msgs.
func (c *linuxBatchingConn) handleRXQOverflowCounter(msgs []ipv6.Message, n int, rxErr error) {
if n == 0 || rxErr != nil || c.rxqOverflowsMetric == nil {
return
}
tailMsg := msgs[n-1] // we only care about the latest value as it's a cumulative counter
if tailMsg.NN == 0 {
return
}
rxqOverflows, err := getRXQOverflowsFromControl(tailMsg.OOB[:tailMsg.NN])
if err != nil {
return
}
// The counter is always present once nonzero on the kernel side. Compare it
// with our previous view, push the delta to the clientmetric, and update
// our view.
if rxqOverflows == c.rxqOverflows {
return
}
delta := int64(rxqOverflows - c.rxqOverflows)
c.rxqOverflowsMetric.Add(delta)
c.rxqOverflows = rxqOverflows
}
func (c *linuxBatchingConn) ReadBatch(msgs []ipv6.Message, flags int) (n int, err error) {
c.readOpMu.Lock()
defer c.readOpMu.Unlock()
if !c.rxOffload || len(msgs) < 2 {
return c.xpc.ReadBatch(msgs, flags)
n, err = c.xpc.ReadBatch(msgs, flags)
c.handleRXQOverflowCounter(msgs, n, err)
return n, err
}
// Read into the tail of msgs, split into the head.
readAt := len(msgs) - 2
numRead, err := c.xpc.ReadBatch(msgs[readAt:], 0)
if err != nil || numRead == 0 {
n, err = c.xpc.ReadBatch(msgs[readAt:], 0)
if err != nil || n == 0 {
return 0, err
}
c.handleRXQOverflowCounter(msgs[readAt:], n, err)
return c.splitCoalescedMessages(msgs, readAt)
}
@@ -310,6 +390,21 @@ func (c *linuxBatchingConn) Close() error {
return c.pc.Close()
}
// tryEnableRXQOverflowsCounter attempts to enable the SO_RXQ_OVFL socket option
// on pconn, and returns the result. SO_RXQ_OVFL was added in Linux v2.6.33.
func tryEnableRXQOverflowsCounter(pconn nettype.PacketConn) (enabled bool) {
if c, ok := pconn.(*net.UDPConn); ok {
rc, err := c.SyscallConn()
if err != nil {
return
}
rc.Control(func(fd uintptr) {
enabled = syscall.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_RXQ_OVFL, 1) == nil
})
}
return enabled
}
// tryEnableUDPOffload attempts to enable the UDP_GRO socket option on pconn,
// and returns two booleans indicating TX and RX UDP offload support.
func tryEnableUDPOffload(pconn nettype.PacketConn) (hasTX bool, hasRX bool) {
@@ -333,24 +428,14 @@ func tryEnableUDPOffload(pconn nettype.PacketConn) (hasTX bool, hasRX bool) {
// getGSOSizeFromControl returns the GSO size found in control. If no GSO size
// is found or the len(control) < unix.SizeofCmsghdr, this function returns 0.
// A non-nil error will be returned if len(control) > unix.SizeofCmsghdr but
// its contents cannot be parsed as a socket control message.
// A non-nil error will be returned if control is malformed.
func getGSOSizeFromControl(control []byte) (int, error) {
var (
hdr unix.Cmsghdr
data []byte
rem = control
err error
)
for len(rem) > unix.SizeofCmsghdr {
hdr, data, rem, err = unix.ParseOneSocketControlMessage(rem)
if err != nil {
return 0, fmt.Errorf("error parsing socket control message: %w", err)
}
if hdr.Level == unix.SOL_UDP && hdr.Type == unix.UDP_GRO && len(data) >= 2 {
return int(binary.NativeEndian.Uint16(data[:2])), nil
}
data, err := getDataFromControl(control, unix.SOL_UDP, unix.UDP_GRO, 2)
if err != nil {
return 0, err
}
if len(data) >= 2 {
return int(binary.NativeEndian.Uint16(data)), nil
}
return 0, nil
}
@@ -374,10 +459,39 @@ func setGSOSizeInControl(control *[]byte, gsoSize uint16) {
*control = (*control)[:unix.CmsgSpace(2)]
}
var (
rxqOverflowsMetricsMu sync.Mutex
rxqOverflowsMetricsByName map[string]*clientmetric.Metric
)
// getRXQOverflowsMetric returns a counter-based [*clientmetric.Metric] for the
// provided name in a thread-safe manner. Callers may pass the same metric name
// multiple times, which is common across rebinds of the underlying, associated
// [Conn].
func getRXQOverflowsMetric(name string) *clientmetric.Metric {
if len(name) == 0 {
return nil
}
rxqOverflowsMetricsMu.Lock()
defer rxqOverflowsMetricsMu.Unlock()
m, ok := rxqOverflowsMetricsByName[name]
if ok {
return m
}
if rxqOverflowsMetricsByName == nil {
rxqOverflowsMetricsByName = make(map[string]*clientmetric.Metric)
}
m = clientmetric.NewCounter(name)
rxqOverflowsMetricsByName[name] = m
return m
}
// TryUpgradeToConn probes the capabilities of the OS and pconn, and upgrades
// pconn to a [Conn] if appropriate. A batch size of [IdealBatchSize] is
// suggested for the best performance.
func TryUpgradeToConn(pconn nettype.PacketConn, network string, batchSize int) nettype.PacketConn {
// suggested for the best performance. If len(rxqOverflowsMetricName) is
// nonzero, then read ops will propagate the SO_RXQ_OVFL control message counter
// to a clientmetric with the supplied name.
func TryUpgradeToConn(pconn nettype.PacketConn, network string, batchSize int, rxqOverflowsMetricName string) nettype.PacketConn {
if runtime.GOOS != "linux" {
// Exclude Android.
return pconn
@@ -431,15 +545,23 @@ func TryUpgradeToConn(pconn nettype.PacketConn, network string, batchSize int) n
var txOffload bool
txOffload, b.rxOffload = tryEnableUDPOffload(uc)
b.txOffload.Store(txOffload)
if len(rxqOverflowsMetricName) > 0 && tryEnableRXQOverflowsCounter(uc) {
// Don't register the metric unless the socket option has been
// successfully set, otherwise we will report a misleading zero value
// counter on the wire. This is one reason why we prefer to handle
// clientmetric instantiation internally, vs letting callers pass them
// to TryUpgradeToConn.
b.rxqOverflowsMetric = getRXQOverflowsMetric(rxqOverflowsMetricName)
}
return b
}
var controlMessageSize = -1 // bomb if used for allocation before init
func init() {
// controlMessageSize is set to hold a UDP_GRO or UDP_SEGMENT control
// message. These contain a single uint16 of data.
controlMessageSize = unix.CmsgSpace(2)
controlMessageSize =
unix.CmsgSpace(2) + // UDP_GRO or UDP_SEGMENT gsoSize (uint16)
unix.CmsgSpace(4) // SO_RXQ_OVFL counter (uint32)
}
// MinControlMessageSize returns the minimum control message size required to