net/dns: recheck DNS config on SERVFAIL errors (#12547)
Fixes tailscale/corp#20677 Replaces the original attempt to rectify this (by injecting a netMon event) which was both heavy handed, and missed cases where the netMon event was "minor". On apple platforms, the fetching the interface's nameservers can and does return an empty list in certain situations. Apple's API in particular is very limiting here. The header hints at notifications for dns changes which would let us react ahead of time, but it's all private APIs. To avoid remaining in the state where we end up with no nameservers but we absolutely need them, we'll react to a lack of upstream nameservers by attempting to re-query the OS. We'll rate limit this to space out the attempts. It seems relatively harmless to attempt a reconfig every 5 seconds (triggered by an incoming query) if the network is in this broken state. Missing nameservers might possibly be a persistent condition (vs a transient error), but that would also imply that something out of our control is badly misconfigured. Tested by randomly returning [] for the nameservers. When switching between Wifi networks, or cell->wifi, this will randomly trigger the bug, and we appear to reliably heal the DNS state. Signed-off-by: Jonathan Nobels <jonathan@tailscale.com>
This commit is contained in:
@@ -14,6 +14,7 @@ import (
|
||||
"runtime"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
@@ -23,6 +24,8 @@ import (
|
||||
"tailscale.com/net/dns/resolver"
|
||||
"tailscale.com/net/netmon"
|
||||
"tailscale.com/net/tsdial"
|
||||
"tailscale.com/syncs"
|
||||
"tailscale.com/tstime/rate"
|
||||
"tailscale.com/types/dnstype"
|
||||
"tailscale.com/types/logger"
|
||||
"tailscale.com/util/clientmetric"
|
||||
@@ -55,6 +58,12 @@ type Manager struct {
|
||||
os OSConfigurator
|
||||
knobs *controlknobs.Knobs // or nil
|
||||
goos string // if empty, gets set to runtime.GOOS
|
||||
|
||||
// The last configuration we successfully compiled. Set to nil if
|
||||
// there was any failure applying the last configuration
|
||||
config *Config
|
||||
// Must be held when accessing/setting config.
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
// NewManagers created a new manager from the given config.
|
||||
@@ -80,6 +89,26 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
|
||||
knobs: knobs,
|
||||
goos: goos,
|
||||
}
|
||||
|
||||
// Rate limit our attempts to correct our DNS configuration.
|
||||
limiter := rate.NewLimiter(1.0/5.0, 1)
|
||||
|
||||
// This will recompile the DNS config, which in turn will requery the system
|
||||
// DNS settings. The recovery func should triggered only when we are missing
|
||||
// upstream nameservers and require them to forward a query.
|
||||
m.resolver.SetMissingUpstreamRecovery(func() {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
if m.config == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if limiter.Allow() {
|
||||
m.logf("DNS resolution failed due to missing upstream nameservers. Recompiling DNS configuration.")
|
||||
m.setLocked(*m.config)
|
||||
}
|
||||
})
|
||||
|
||||
m.ctx, m.ctxCancel = context.WithCancel(context.Background())
|
||||
m.logf("using %T", m.os)
|
||||
return m
|
||||
@@ -89,6 +118,19 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
|
||||
func (m *Manager) Resolver() *resolver.Resolver { return m.resolver }
|
||||
|
||||
func (m *Manager) Set(cfg Config) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
return m.setLocked(cfg)
|
||||
}
|
||||
|
||||
// Sets the DNS configuration.
|
||||
// m.mu must be held
|
||||
func (m *Manager) setLocked(cfg Config) error {
|
||||
syncs.AssertLocked(&m.mu)
|
||||
|
||||
// On errors, the 'set' config is cleared.
|
||||
m.config = nil
|
||||
|
||||
m.logf("Set: %v", logger.ArgWriter(func(w *bufio.Writer) {
|
||||
cfg.WriteToBufioWriter(w)
|
||||
}))
|
||||
@@ -112,7 +154,9 @@ func (m *Manager) Set(cfg Config) error {
|
||||
m.health.SetDNSOSHealth(err)
|
||||
return err
|
||||
}
|
||||
|
||||
m.health.SetDNSOSHealth(nil)
|
||||
m.config = &cfg
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user