From 4d68493144369198ec908d79b5c901ed3fb90fa8 Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Wed, 13 May 2026 16:51:18 -0500 Subject: [PATCH] health: avoid publishing health.Change when warnable visibility remains unchanged Warnables with a non-zero TimeToVisible are only published on the eventbus when they remain unhealthy long enough to become visible. However, we still publish a health.Change when a warning that was never visible (and was never published to the eventbus) becomes healthy. This PR fixes that and reduces churn when there is no actual state change. In particular, it avoids unnecessary IPN bus notifications sent to GUI/CLI clients, captive portal detection, etc. Updates tailscale/corp#39759 (noticed while working on it) Signed-off-by: Nick Khyl --- health/health.go | 28 +++++++-- health/health_test.go | 129 ++++++++++++++++++++++++++++-------------- 2 files changed, 109 insertions(+), 48 deletions(-) diff --git a/health/health.go b/health/health.go index 1829bd482..7e2878159 100644 --- a/health/health.go +++ b/health/health.go @@ -492,7 +492,12 @@ func (t *Tracker) SetHealthy(w *Warnable) { } func (t *Tracker) setHealthyLocked(w *Warnable) { - if !buildfeatures.HasHealth || t.warnableVal[w] == nil { + if !buildfeatures.HasHealth { + return + } + + ws := t.warnableVal[w] + if ws == nil { // Nothing to remove return } @@ -501,15 +506,28 @@ func (t *Tracker) setHealthyLocked(w *Warnable) { // Stop any pending visiblity timers for this Warnable if canc, ok := t.pendingVisibleTimers[w]; ok { + // We removed the warningState for this Warnable, + // and we hold the lock, so even if the timer callback + // has already started, it won't find a warningState + // for this Warnable and won't publish any changes. canc.Stop() delete(t.pendingVisibleTimers, w) } - change := Change{ - WarnableChanged: true, - Warnable: w, + // Only publish a change if the Warnable was unhealthy long + // enough to become visible to the user. Otherwise, it would + // not have been published as unhealthy, so there is no need + // to publish it as healthy. This prevents eventbus (and by + // extension the IPN bus) churn for Warnables that are marked + // unhealthy and then healthy again. Notably, this includes + // warnables touched by [Tracker.updateBuiltinWarnablesLocked]. + if w.IsVisible(ws, t.now) { + change := Change{ + WarnableChanged: true, + Warnable: w, + } + t.changePub.Publish(change) } - t.changePub.Publish(change) } // notifyWatchersControlChangedLocked calls each watcher to signal that control diff --git a/health/health_test.go b/health/health_test.go index ccd49b19a..4b5af6d76 100644 --- a/health/health_test.go +++ b/health/health_test.go @@ -271,47 +271,91 @@ func TestSetUnhealthyWithTimeToVisible(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(*testing.T) { - bus := eventbustest.NewBus(t) - ht := NewTracker(bus) - mw := Register(&Warnable{ - Code: "test-warnable-3-secs-to-visible", - Title: "Test Warnable with 3 seconds to visible", - Text: StaticMessage("Hello world"), - TimeToVisible: 2 * time.Second, - ImpactsConnectivity: true, + synctest.Test(t, func(t *testing.T) { + clock := tstest.NewClock(tstest.ClockOpts{ + Start: time.Unix(123, 0), + FollowRealTime: false, + }) + bus := eventbustest.NewBus(t) + ht := NewTracker(bus) + ht.testClock = clock + mw := Register(&Warnable{ + Code: "test-warnable-3-secs-to-visible", + Title: "Test Warnable with 3 seconds to visible", + Text: StaticMessage("Hello world"), + TimeToVisible: 2 * time.Second, + ImpactsConnectivity: true, + }) + defer unregister(mw) + + becameUnhealthy := make(chan struct{}) + becameHealthy := make(chan struct{}) + + watchFunc := func(c Change) { + w := c.Warnable + us := c.UnhealthyState + if w != mw { + t.Fatalf("watcherFunc was called, but with an unexpected Warnable: %v, want: %v", w, w) + } + + if us != nil { + becameUnhealthy <- struct{}{} + } else { + becameHealthy <- struct{}{} + } + } + + tt.preFunc(t, ht, bus, watchFunc) + ht.SetUnhealthy(mw, Args{ArgError: "Hello world"}) + + // Advance time by half of the TimeToVisible duration. + clock.Advance(mw.TimeToVisible / 2) + + select { + case <-becameUnhealthy: + // Test failed because the watcher got notified of an unhealthy state + t.Fatalf("watcherFunc was called with an unhealthy state") + case <-becameHealthy: + // Test failed because the watcher got of a healthy state + t.Fatalf("watcherFunc was called with a healthy state") + default: + // As expected, watcherFunc still had not been called + // after mw.TimeToVisible / 2. + } + + // Advance time to get past the the TimeToVisible duration. + // The watcher should be notified of the unhealthy state. + clock.Advance(mw.TimeToVisible/2 + 1) + <-becameUnhealthy + + // Reset the warnable to neutral / healthy state before + // the next part of the test. + ht.SetHealthy(mw) + <-becameHealthy + + // Mark the warnable unhealthy and then immediately healthy + // before the TimeToVisible duration elapses. + // The watcher should not be notified of either change + // because the warnable never became visible. + ht.SetUnhealthy(mw, Args{ArgError: "Hello world"}) + ht.SetHealthy(mw) + + // Advance to get past the the TimeToVisible delay. + clock.Advance(mw.TimeToVisible * 2) + synctest.Wait() + + select { + case <-becameUnhealthy: + // Test failed because the watcher got notified of an unhealthy state + t.Fatalf("watcherFunc was called with an unhealthy state") + case <-becameHealthy: + // Test failed because the watcher got of a healthy state + t.Fatalf("watcherFunc was called with a healthy state") + default: + // As expected, watcherFunc was not called after marking + // the warnable healthy again as it never became visible. + } }) - - becameUnhealthy := make(chan struct{}) - becameHealthy := make(chan struct{}) - - watchFunc := func(c Change) { - w := c.Warnable - us := c.UnhealthyState - if w != mw { - t.Fatalf("watcherFunc was called, but with an unexpected Warnable: %v, want: %v", w, w) - } - - if us != nil { - becameUnhealthy <- struct{}{} - } else { - becameHealthy <- struct{}{} - } - } - - tt.preFunc(t, ht, bus, watchFunc) - ht.SetUnhealthy(mw, Args{ArgError: "Hello world"}) - - select { - case <-becameUnhealthy: - // Test failed because the watcher got notified of an unhealthy state - t.Fatalf("watcherFunc was called with an unhealthy state") - case <-becameHealthy: - // Test failed because the watcher got of a healthy state - t.Fatalf("watcherFunc was called with a healthy state") - case <-time.After(1 * time.Second): - // As expected, watcherFunc still had not been called after 1 second - } - unregister(mw) }) } } @@ -739,12 +783,11 @@ func TestControlHealthNotifies(t *testing.T) { ht.SetIPNState("NeedsLogin", true) ht.GotStreamedMapResponse() - // Expect events at starup, before doing anything else, skip unstable - // event and no warning event as they show up at different times. + // Expect events at starup, before doing anything else, skip + // the warming up events. synctest.Wait() if err := eventbustest.Expect(tw, CompareWarnableCode(t, tsconst.HealthWarnableWarmingUp), - CompareWarnableCode(t, tsconst.HealthWarnableNotInMapPoll), CompareWarnableCode(t, tsconst.HealthWarnableWarmingUp), ); err != nil { t.Errorf("startup error: %v", err)