util/eventbus: allow logging of slow subscribers (#17705)

Add options to the eventbus.Bus to plumb in a logger.

Route that logger into the subscriber machinery, and trigger a log message to
it when a subscriber fails to respond to its delivered events for 5s or more.

The log message includes the package, filename, and line number of the call
site that created the subscription.

Add tests that verify this works.

Updates #17680

Change-Id: I0546516476b1e13e6a9cf79f19db2fe55e56c698
Signed-off-by: M. J. Fromberger <fromberger@tailscale.com>
This commit is contained in:
M. J. Fromberger
2025-10-30 14:40:57 -07:00
committed by GitHub
parent f522b9dbb7
commit 061e6266cf
10 changed files with 185 additions and 13 deletions
+33 -2
View File
@@ -8,6 +8,9 @@ import (
"fmt"
"reflect"
"sync"
"time"
"tailscale.com/types/logger"
)
type DeliveredEvent struct {
@@ -182,12 +185,18 @@ type Subscriber[T any] struct {
stop stopFlag
read chan T
unregister func()
logf logger.Logf
slow *time.Timer // used to detect slow subscriber service
}
func newSubscriber[T any](r *subscribeState) *Subscriber[T] {
func newSubscriber[T any](r *subscribeState, logf logger.Logf) *Subscriber[T] {
slow := time.NewTimer(0)
slow.Stop() // reset in dispatch
return &Subscriber[T]{
read: make(chan T),
unregister: func() { r.deleteSubscriber(reflect.TypeFor[T]()) },
logf: logf,
slow: slow,
}
}
@@ -212,6 +221,11 @@ func (s *Subscriber[T]) monitor(debugEvent T) {
func (s *Subscriber[T]) dispatch(ctx context.Context, vals *queue[DeliveredEvent], acceptCh func() chan DeliveredEvent, snapshot chan chan []DeliveredEvent) bool {
t := vals.Peek().Event.(T)
start := time.Now()
s.slow.Reset(slowSubscriberTimeout)
defer s.slow.Stop()
for {
// Keep the cases in this select in sync with subscribeState.pump
// above. The only difference should be that this select
@@ -226,6 +240,9 @@ func (s *Subscriber[T]) dispatch(ctx context.Context, vals *queue[DeliveredEvent
return false
case ch := <-snapshot:
ch <- vals.Snapshot()
case <-s.slow.C:
s.logf("subscriber for %T is slow (%v elapsed)", t, time.Since(start))
s.slow.Reset(slowSubscriberTimeout)
}
}
}
@@ -260,12 +277,18 @@ type SubscriberFunc[T any] struct {
stop stopFlag
read func(T)
unregister func()
logf logger.Logf
slow *time.Timer // used to detect slow subscriber service
}
func newSubscriberFunc[T any](r *subscribeState, f func(T)) *SubscriberFunc[T] {
func newSubscriberFunc[T any](r *subscribeState, f func(T), logf logger.Logf) *SubscriberFunc[T] {
slow := time.NewTimer(0)
slow.Stop() // reset in dispatch
return &SubscriberFunc[T]{
read: f,
unregister: func() { r.deleteSubscriber(reflect.TypeFor[T]()) },
logf: logf,
slow: slow,
}
}
@@ -285,6 +308,11 @@ func (s *SubscriberFunc[T]) dispatch(ctx context.Context, vals *queue[DeliveredE
t := vals.Peek().Event.(T)
callDone := make(chan struct{})
go s.runCallback(t, callDone)
start := time.Now()
s.slow.Reset(slowSubscriberTimeout)
defer s.slow.Stop()
// Keep the cases in this select in sync with subscribeState.pump
// above. The only difference should be that this select
// delivers a value by calling s.read.
@@ -299,6 +327,9 @@ func (s *SubscriberFunc[T]) dispatch(ctx context.Context, vals *queue[DeliveredE
return false
case ch := <-snapshot:
ch <- vals.Snapshot()
case <-s.slow.C:
s.logf("subscriber for %T is slow (%v elapsed)", t, time.Since(start))
s.slow.Reset(slowSubscriberTimeout)
}
}
}