net/{batching,udprelay},wgengine/magicsock: add SO_RXQ_OVFL clientmetrics
For the purpose of improved observability of UDP socket receive buffer overflows on Linux. Updates tailscale/corp#37679 Signed-off-by: Jordan Whited <jordan@tailscale.com>
This commit is contained in:
committed by
Jordan Whited
parent
660a4608d2
commit
96dde53b43
+147
-25
@@ -24,6 +24,7 @@ import (
|
||||
"tailscale.com/net/neterror"
|
||||
"tailscale.com/net/packet"
|
||||
"tailscale.com/types/nettype"
|
||||
"tailscale.com/util/clientmetric"
|
||||
)
|
||||
|
||||
// xnetBatchReaderWriter defines the batching i/o methods of
|
||||
@@ -58,6 +59,14 @@ type linuxBatchingConn struct {
|
||||
setGSOSizeInControl func(control *[]byte, gsoSize uint16) // typically setGSOSizeInControl(); swappable for testing
|
||||
getGSOSizeFromControl func(control []byte) (int, error) // typically getGSOSizeFromControl(); swappable for testing
|
||||
sendBatchPool sync.Pool
|
||||
rxqOverflowsMetric *clientmetric.Metric
|
||||
|
||||
// readOpMu guards read operations that must perform accounting against
|
||||
// rxqOverflows in single-threaded fashion. There are no concurrent usages
|
||||
// of read operations at the time of writing (2026-03-09), but it would be
|
||||
// unidiomatic to push this responsibility onto callers.
|
||||
readOpMu sync.Mutex
|
||||
rxqOverflows uint32 // kernel pumps a cumulative counter, which we track to push a clientmetric delta value
|
||||
}
|
||||
|
||||
func (c *linuxBatchingConn) ReadFromUDPAddrPort(p []byte) (n int, addr netip.AddrPort, err error) {
|
||||
@@ -285,16 +294,87 @@ func (c *linuxBatchingConn) splitCoalescedMessages(msgs []ipv6.Message, firstMsg
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// getDataFromControl returns the data portion of the first control msg with
|
||||
// matching cmsgLevel, matching cmsgType, and min data len of minDataLen, in
|
||||
// control. If no matching cmsg is found or the len(control) < unix.SizeofCmsghdr,
|
||||
// this function returns nil data. A non-nil error will be returned if
|
||||
// len(control) > unix.SizeofCmsghdr but its contents cannot be parsed as a
|
||||
// socket control message.
|
||||
func getDataFromControl(control []byte, cmsgLevel, cmsgType int32, minDataLen int) ([]byte, error) {
|
||||
var (
|
||||
hdr unix.Cmsghdr
|
||||
data []byte
|
||||
rem = control
|
||||
err error
|
||||
)
|
||||
|
||||
for len(rem) > unix.SizeofCmsghdr {
|
||||
hdr, data, rem, err = unix.ParseOneSocketControlMessage(rem)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing socket control message: %w", err)
|
||||
}
|
||||
if hdr.Level == cmsgLevel && hdr.Type == cmsgType && len(data) >= minDataLen {
|
||||
return data, nil
|
||||
}
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// getRXQOverflowsFromControl returns the rxq overflows cumulative counter found
|
||||
// in control. If no rxq counter is found or the len(control) < unix.SizeofCmsghdr,
|
||||
// this function returns 0. A non-nil error will be returned if control is
|
||||
// malformed.
|
||||
func getRXQOverflowsFromControl(control []byte) (uint32, error) {
|
||||
data, err := getDataFromControl(control, unix.SOL_SOCKET, unix.SO_RXQ_OVFL, 4)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if len(data) >= 4 {
|
||||
return binary.NativeEndian.Uint32(data), nil
|
||||
}
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// handleRXQOverflowCounter handles any rx queue overflow counter contained in
|
||||
// the tail of msgs.
|
||||
func (c *linuxBatchingConn) handleRXQOverflowCounter(msgs []ipv6.Message, n int, rxErr error) {
|
||||
if n == 0 || rxErr != nil || c.rxqOverflowsMetric == nil {
|
||||
return
|
||||
}
|
||||
tailMsg := msgs[n-1] // we only care about the latest value as it's a cumulative counter
|
||||
if tailMsg.NN == 0 {
|
||||
return
|
||||
}
|
||||
rxqOverflows, err := getRXQOverflowsFromControl(tailMsg.OOB[:tailMsg.NN])
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
// The counter is always present once nonzero on the kernel side. Compare it
|
||||
// with our previous view, push the delta to the clientmetric, and update
|
||||
// our view.
|
||||
if rxqOverflows == c.rxqOverflows {
|
||||
return
|
||||
}
|
||||
delta := int64(rxqOverflows - c.rxqOverflows)
|
||||
c.rxqOverflowsMetric.Add(delta)
|
||||
c.rxqOverflows = rxqOverflows
|
||||
}
|
||||
|
||||
func (c *linuxBatchingConn) ReadBatch(msgs []ipv6.Message, flags int) (n int, err error) {
|
||||
c.readOpMu.Lock()
|
||||
defer c.readOpMu.Unlock()
|
||||
if !c.rxOffload || len(msgs) < 2 {
|
||||
return c.xpc.ReadBatch(msgs, flags)
|
||||
n, err = c.xpc.ReadBatch(msgs, flags)
|
||||
c.handleRXQOverflowCounter(msgs, n, err)
|
||||
return n, err
|
||||
}
|
||||
// Read into the tail of msgs, split into the head.
|
||||
readAt := len(msgs) - 2
|
||||
numRead, err := c.xpc.ReadBatch(msgs[readAt:], 0)
|
||||
if err != nil || numRead == 0 {
|
||||
n, err = c.xpc.ReadBatch(msgs[readAt:], 0)
|
||||
if err != nil || n == 0 {
|
||||
return 0, err
|
||||
}
|
||||
c.handleRXQOverflowCounter(msgs[readAt:], n, err)
|
||||
return c.splitCoalescedMessages(msgs, readAt)
|
||||
}
|
||||
|
||||
@@ -310,6 +390,21 @@ func (c *linuxBatchingConn) Close() error {
|
||||
return c.pc.Close()
|
||||
}
|
||||
|
||||
// tryEnableRXQOverflowsCounter attempts to enable the SO_RXQ_OVFL socket option
|
||||
// on pconn, and returns the result. SO_RXQ_OVFL was added in Linux v2.6.33.
|
||||
func tryEnableRXQOverflowsCounter(pconn nettype.PacketConn) (enabled bool) {
|
||||
if c, ok := pconn.(*net.UDPConn); ok {
|
||||
rc, err := c.SyscallConn()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
rc.Control(func(fd uintptr) {
|
||||
enabled = syscall.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_RXQ_OVFL, 1) == nil
|
||||
})
|
||||
}
|
||||
return enabled
|
||||
}
|
||||
|
||||
// tryEnableUDPOffload attempts to enable the UDP_GRO socket option on pconn,
|
||||
// and returns two booleans indicating TX and RX UDP offload support.
|
||||
func tryEnableUDPOffload(pconn nettype.PacketConn) (hasTX bool, hasRX bool) {
|
||||
@@ -333,24 +428,14 @@ func tryEnableUDPOffload(pconn nettype.PacketConn) (hasTX bool, hasRX bool) {
|
||||
|
||||
// getGSOSizeFromControl returns the GSO size found in control. If no GSO size
|
||||
// is found or the len(control) < unix.SizeofCmsghdr, this function returns 0.
|
||||
// A non-nil error will be returned if len(control) > unix.SizeofCmsghdr but
|
||||
// its contents cannot be parsed as a socket control message.
|
||||
// A non-nil error will be returned if control is malformed.
|
||||
func getGSOSizeFromControl(control []byte) (int, error) {
|
||||
var (
|
||||
hdr unix.Cmsghdr
|
||||
data []byte
|
||||
rem = control
|
||||
err error
|
||||
)
|
||||
|
||||
for len(rem) > unix.SizeofCmsghdr {
|
||||
hdr, data, rem, err = unix.ParseOneSocketControlMessage(rem)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("error parsing socket control message: %w", err)
|
||||
}
|
||||
if hdr.Level == unix.SOL_UDP && hdr.Type == unix.UDP_GRO && len(data) >= 2 {
|
||||
return int(binary.NativeEndian.Uint16(data[:2])), nil
|
||||
}
|
||||
data, err := getDataFromControl(control, unix.SOL_UDP, unix.UDP_GRO, 2)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if len(data) >= 2 {
|
||||
return int(binary.NativeEndian.Uint16(data)), nil
|
||||
}
|
||||
return 0, nil
|
||||
}
|
||||
@@ -374,10 +459,39 @@ func setGSOSizeInControl(control *[]byte, gsoSize uint16) {
|
||||
*control = (*control)[:unix.CmsgSpace(2)]
|
||||
}
|
||||
|
||||
var (
|
||||
rxqOverflowsMetricsMu sync.Mutex
|
||||
rxqOverflowsMetricsByName map[string]*clientmetric.Metric
|
||||
)
|
||||
|
||||
// getRXQOverflowsMetric returns a counter-based [*clientmetric.Metric] for the
|
||||
// provided name in a thread-safe manner. Callers may pass the same metric name
|
||||
// multiple times, which is common across rebinds of the underlying, associated
|
||||
// [Conn].
|
||||
func getRXQOverflowsMetric(name string) *clientmetric.Metric {
|
||||
if len(name) == 0 {
|
||||
return nil
|
||||
}
|
||||
rxqOverflowsMetricsMu.Lock()
|
||||
defer rxqOverflowsMetricsMu.Unlock()
|
||||
m, ok := rxqOverflowsMetricsByName[name]
|
||||
if ok {
|
||||
return m
|
||||
}
|
||||
if rxqOverflowsMetricsByName == nil {
|
||||
rxqOverflowsMetricsByName = make(map[string]*clientmetric.Metric)
|
||||
}
|
||||
m = clientmetric.NewCounter(name)
|
||||
rxqOverflowsMetricsByName[name] = m
|
||||
return m
|
||||
}
|
||||
|
||||
// TryUpgradeToConn probes the capabilities of the OS and pconn, and upgrades
|
||||
// pconn to a [Conn] if appropriate. A batch size of [IdealBatchSize] is
|
||||
// suggested for the best performance.
|
||||
func TryUpgradeToConn(pconn nettype.PacketConn, network string, batchSize int) nettype.PacketConn {
|
||||
// suggested for the best performance. If len(rxqOverflowsMetricName) is
|
||||
// nonzero, then read ops will propagate the SO_RXQ_OVFL control message counter
|
||||
// to a clientmetric with the supplied name.
|
||||
func TryUpgradeToConn(pconn nettype.PacketConn, network string, batchSize int, rxqOverflowsMetricName string) nettype.PacketConn {
|
||||
if runtime.GOOS != "linux" {
|
||||
// Exclude Android.
|
||||
return pconn
|
||||
@@ -431,15 +545,23 @@ func TryUpgradeToConn(pconn nettype.PacketConn, network string, batchSize int) n
|
||||
var txOffload bool
|
||||
txOffload, b.rxOffload = tryEnableUDPOffload(uc)
|
||||
b.txOffload.Store(txOffload)
|
||||
if len(rxqOverflowsMetricName) > 0 && tryEnableRXQOverflowsCounter(uc) {
|
||||
// Don't register the metric unless the socket option has been
|
||||
// successfully set, otherwise we will report a misleading zero value
|
||||
// counter on the wire. This is one reason why we prefer to handle
|
||||
// clientmetric instantiation internally, vs letting callers pass them
|
||||
// to TryUpgradeToConn.
|
||||
b.rxqOverflowsMetric = getRXQOverflowsMetric(rxqOverflowsMetricName)
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
var controlMessageSize = -1 // bomb if used for allocation before init
|
||||
|
||||
func init() {
|
||||
// controlMessageSize is set to hold a UDP_GRO or UDP_SEGMENT control
|
||||
// message. These contain a single uint16 of data.
|
||||
controlMessageSize = unix.CmsgSpace(2)
|
||||
controlMessageSize =
|
||||
unix.CmsgSpace(2) + // UDP_GRO or UDP_SEGMENT gsoSize (uint16)
|
||||
unix.CmsgSpace(4) // SO_RXQ_OVFL counter (uint32)
|
||||
}
|
||||
|
||||
// MinControlMessageSize returns the minimum control message size required to
|
||||
|
||||
Reference in New Issue
Block a user