net/udprelay: add tailscaled_peer_relay_endpoints gauge (#18265)

The new gauge reflects endpoint state via labels:
- open, when both peers are connected and ready to talk, and
- connecting, when at least one peer hasn't connected yet.

Corresponding client metrics are logged as
- udprelay_endpoints_connecting
- udprelay_endpoints_open

Updates tailscale/corp#30820

Change-Id: Idb1baa90a38c97847e14f9b2390093262ad0ea23

Signed-off-by: Alex Valiushko <alexvaliushko@tailscale.com>
This commit is contained in:
Alex Valiushko
2026-01-21 21:55:37 -08:00
committed by GitHub
parent 6dc0bd834c
commit 4b7585df77
4 changed files with 258 additions and 24 deletions
+71 -21
View File
@@ -122,6 +122,7 @@ type serverEndpoint struct {
allocatedAt mono.Time
mu sync.Mutex // guards the following fields
closed bool // signals that no new data should be accepted
inProgressGeneration [2]uint32 // or zero if a handshake has never started, or has just completed
boundAddrPorts [2]netip.AddrPort // or zero value if a handshake has never completed for that relay leg
lastSeen [2]mono.Time
@@ -151,9 +152,15 @@ func blakeMACFromBindMsg(blakeKey [blake2s.Size]byte, src netip.AddrPort, msg di
return out, nil
}
func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex int, discoMsg disco.Message, serverDisco key.DiscoPublic, macSecrets views.Slice[[blake2s.Size]byte], now mono.Time) (write []byte, to netip.AddrPort) {
func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex int, discoMsg disco.Message, serverDisco key.DiscoPublic, macSecrets views.Slice[[blake2s.Size]byte], now mono.Time, m endpointUpdater) (write []byte, to netip.AddrPort) {
e.mu.Lock()
defer e.mu.Unlock()
lastState := e.stateLocked()
if lastState == endpointClosed {
// endpoint was closed in [Server.endpointGC]
return nil, netip.AddrPort{}
}
if senderIndex != 0 && senderIndex != 1 {
return nil, netip.AddrPort{}
@@ -230,6 +237,7 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex
if bytes.Equal(mac[:], discoMsg.Challenge[:]) {
// Handshake complete. Update the binding for this sender.
e.boundAddrPorts[senderIndex] = from
m.updateEndpoint(lastState, e.stateLocked())
e.lastSeen[senderIndex] = now // record last seen as bound time
e.inProgressGeneration[senderIndex] = 0 // reset to zero, which indicates there is no in-progress handshake
return nil, netip.AddrPort{}
@@ -243,7 +251,7 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex
}
}
func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []byte, serverDisco key.DiscoPublic, macSecrets views.Slice[[blake2s.Size]byte], now mono.Time) (write []byte, to netip.AddrPort) {
func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []byte, serverDisco key.DiscoPublic, macSecrets views.Slice[[blake2s.Size]byte], now mono.Time, m endpointUpdater) (write []byte, to netip.AddrPort) {
senderRaw, isDiscoMsg := disco.Source(b)
if !isDiscoMsg {
// Not a Disco message
@@ -274,7 +282,7 @@ func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []by
return nil, netip.AddrPort{}
}
return e.handleDiscoControlMsg(from, senderIndex, discoMsg, serverDisco, macSecrets, now)
return e.handleDiscoControlMsg(from, senderIndex, discoMsg, serverDisco, macSecrets, now, m)
}
func (e *serverEndpoint) handleDataPacket(from netip.AddrPort, b []byte, now mono.Time) (write []byte, to netip.AddrPort) {
@@ -284,6 +292,10 @@ func (e *serverEndpoint) handleDataPacket(from netip.AddrPort, b []byte, now mon
// not a control packet, but serverEndpoint isn't bound
return nil, netip.AddrPort{}
}
if e.stateLocked() == endpointClosed {
// endpoint was closed in [Server.endpointGC]
return nil, netip.AddrPort{}
}
switch {
case from == e.boundAddrPorts[0]:
e.lastSeen[0] = now
@@ -301,9 +313,21 @@ func (e *serverEndpoint) handleDataPacket(from netip.AddrPort, b []byte, now mon
}
}
func (e *serverEndpoint) isExpired(now mono.Time, bindLifetime, steadyStateLifetime time.Duration) bool {
// maybeExpire marks the endpoint closed if [serverEndpoint.isExpiredLocked]
// reports it as expired for the given now, bindLifetime and
// steadyStateLifetime. It takes e.mu for the duration of the call.
//
// On expiry, m is told about the transition from the state observed before
// closing to the state observed afterwards, and true is returned. Otherwise
// the endpoint is left untouched and false is returned.
func (e *serverEndpoint) maybeExpire(now mono.Time, bindLifetime, steadyStateLifetime time.Duration, m endpointUpdater) bool {
	e.mu.Lock()
	defer e.mu.Unlock()

	// Capture the state before any mutation so the updater sees the
	// real "from" side of the transition.
	prev := e.stateLocked()
	if !e.isExpiredLocked(now, bindLifetime, steadyStateLifetime) {
		return false
	}

	e.closed = true
	m.updateEndpoint(prev, e.stateLocked())
	return true
}
func (e *serverEndpoint) isExpiredLocked(now mono.Time, bindLifetime, steadyStateLifetime time.Duration) bool {
if !e.isBoundLocked() {
if now.Sub(e.allocatedAt) > bindLifetime {
return true
@@ -323,6 +347,31 @@ func (e *serverEndpoint) isBoundLocked() bool {
e.boundAddrPorts[1].IsValid()
}
// stateLocked reports the endpoint's current [endpointState], derived from
// the closed flag and the peers' handshake bindings:
//
//   - endpointClosed if e is nil or has been marked closed,
//   - endpointOpen if both relay legs have a valid bound address, and
//   - endpointConnecting otherwise.
//
// It reads fields guarded by e.mu and does no locking of its own.
func (e *serverEndpoint) stateLocked() endpointState {
	if e == nil || e.closed {
		return endpointClosed
	}
	for i := range e.boundAddrPorts {
		if !e.boundAddrPorts[i].IsValid() {
			// At least one leg has not completed its handshake.
			return endpointConnecting
		}
	}
	return endpointOpen
}
// endpointState is the canonical name of an endpoint's state, as computed by
// [serverEndpoint.stateLocked].
//
// It is a string enum rather than a Stringer implementation because
// usermetrics can't handle Stringer.
type endpointState string

// Known endpointState values.
const (
	// endpointClosed means unallocated; not tracked in metrics.
	endpointClosed = endpointState("closed")
	// endpointConnecting means at least one peer has not completed handshake.
	endpointConnecting = endpointState("connecting")
	// endpointOpen means ready to forward.
	endpointOpen = endpointState("open")
)
// NewServer constructs a [Server] listening on port. If port is zero, then
// port selection is left up to the host networking stack. If
// onlyStaticAddrPorts is true, then dynamic addr:port discovery will be
@@ -703,33 +752,33 @@ func (s *Server) Close() error {
clear(s.serverEndpointByDisco)
s.closed = true
s.bus.Close()
deregisterMetrics()
})
return nil
}
// endpointGC performs one garbage-collection pass over all server endpoints.
// Every endpoint for which [serverEndpoint.maybeExpire] returns true, given
// bindLifetime and steadyStateLifetime, is removed from both the by-disco
// and by-VNI indexes. s.mu is held for the whole pass.
func (s *Server) endpointGC(bindLifetime, steadyStateLifetime time.Duration) {
	// A single timestamp is taken up front and used for every endpoint.
	now := mono.Now()

	// TODO: consider performance implications of scanning all endpoints and
	// holding s.mu for the duration. Keep it simple (and slow) for now.
	s.mu.Lock()
	defer s.mu.Unlock()

	for discoPair, ep := range s.serverEndpointByDisco {
		if !ep.maybeExpire(now, bindLifetime, steadyStateLifetime, s.metrics) {
			continue
		}
		delete(s.serverEndpointByDisco, discoPair)
		s.serverEndpointByVNI.Delete(ep.vni)
	}
}
func (s *Server) endpointGCLoop() {
defer s.wg.Done()
ticker := time.NewTicker(s.bindLifetime)
defer ticker.Stop()
gc := func() {
now := mono.Now()
// TODO: consider performance implications of scanning all endpoints and
// holding s.mu for the duration. Keep it simple (and slow) for now.
s.mu.Lock()
defer s.mu.Unlock()
for k, v := range s.serverEndpointByDisco {
if v.isExpired(now, s.bindLifetime, s.steadyStateLifetime) {
delete(s.serverEndpointByDisco, k)
s.serverEndpointByVNI.Delete(v.vni)
}
}
}
for {
select {
case <-ticker.C:
gc()
s.endpointGC(s.bindLifetime, s.steadyStateLifetime)
case <-s.closeCh:
return
}
@@ -773,7 +822,7 @@ func (s *Server) handlePacket(from netip.AddrPort, b []byte) (write []byte, to n
}
msg := b[packet.GeneveFixedHeaderLength:]
secrets := s.getMACSecrets(now)
write, to = e.(*serverEndpoint).handleSealedDiscoControlMsg(from, msg, s.discoPublic, secrets, now)
write, to = e.(*serverEndpoint).handleSealedDiscoControlMsg(from, msg, s.discoPublic, secrets, now, s.metrics)
isDataPacket = false
return
}
@@ -1015,6 +1064,7 @@ func (s *Server) AllocateEndpoint(discoA, discoB key.DiscoPublic) (endpoint.Serv
s.serverEndpointByVNI.Store(e.vni, e)
s.logf("allocated endpoint vni=%d lamportID=%d disco[0]=%v disco[1]=%v", e.vni, e.lamportID, pair.Get()[0].ShortString(), pair.Get()[1].ShortString())
s.metrics.updateEndpoint(endpointClosed, endpointConnecting)
return endpoint.ServerEndpoint{
ServerDisco: s.discoPublic,
ClientDisco: pair.Get(),