net/dns: retrample resolv.conf when another process has trampled it (#18069)

When using the resolv.conf file for setting DNS, it is possible that
some other services will trample the file and overwrite our set DNS
server. Experiments have shown this to be a racy error depending on how
quickly processes start.

Make an attempt to trample back the file a limited number of times if
the file is changed.

Updates #16635

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
This commit is contained in:
Claus Lensbøl
2025-12-09 14:55:26 -05:00
committed by GitHub
parent a9b37c510c
commit 1dfdee8521
17 changed files with 261 additions and 45 deletions
+39 -10
View File
@@ -55,6 +55,8 @@ type Manager struct {
logf logger.Logf
health *health.Tracker
eventClient *eventbus.Client
activeQueriesAtomic int32
ctx context.Context // good until Down
@@ -69,10 +71,10 @@ type Manager struct {
config *Config // Tracks the last viable DNS configuration set by Set. nil on failures other than compilation failures or if set has never been called.
}
// NewManagers created a new manager from the given config.
// NewManager creates a new manager from the given config.
//
// knobs may be nil.
func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker, dialer *tsdial.Dialer, linkSel resolver.ForwardLinkSelector, knobs *controlknobs.Knobs, goos string) *Manager {
func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker, dialer *tsdial.Dialer, linkSel resolver.ForwardLinkSelector, knobs *controlknobs.Knobs, goos string, bus *eventbus.Bus) *Manager {
if !buildfeatures.HasDNS {
return nil
}
@@ -96,6 +98,20 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
goos: goos,
}
m.eventClient = bus.Client("dns.Manager")
eventbus.SubscribeFunc(m.eventClient, func(trample TrampleDNS) {
m.mu.Lock()
defer m.mu.Unlock()
if m.config == nil {
m.logf("resolve.conf was trampled, but there is no DNS config")
return
}
m.logf("resolve.conf was trampled, setting existing config again")
if err := m.setLocked(*m.config); err != nil {
m.logf("error setting DNS config: %s", err)
}
})
m.ctx, m.ctxCancel = context.WithCancel(context.Background())
m.logf("using %T", m.os)
return m
@@ -178,9 +194,7 @@ func (m *Manager) setLocked(cfg Config) error {
m.config = nil
return err
}
if err := m.os.SetDNS(ocfg); err != nil {
m.config = nil
m.health.SetUnhealthy(osConfigurationSetWarnable, health.Args{health.ArgError: err.Error()})
if err := m.setDNSLocked(ocfg); err != nil {
return err
}
@@ -190,6 +204,15 @@ func (m *Manager) setLocked(cfg Config) error {
return nil
}
// setDNSLocked hands ocfg to the OS configurator via m.os.SetDNS.
//
// If the OS configurator rejects the configuration, the cached m.config is
// cleared, osConfigurationSetWarnable is marked unhealthy with the error
// text, and the error is returned unchanged. On success it returns nil and
// touches no other state.
//
// It is called from setLocked; as the Locked suffix indicates, the caller is
// expected to already hold m.mu.
func (m *Manager) setDNSLocked(ocfg OSConfig) error {
	setErr := m.os.SetDNS(ocfg)
	if setErr == nil {
		return nil
	}

	// The OS did not accept the config, so the stored config is dropped
	// and the failure is surfaced through the health tracker.
	m.config = nil
	m.health.SetUnhealthy(osConfigurationSetWarnable, health.Args{health.ArgError: setErr.Error()})
	return setErr
}
// compileHostEntries creates a list of single-label resolutions possible
// from the configured hosts and search domains.
// The entries are compiled in the order of the search domains, then the hosts.
@@ -457,6 +480,13 @@ const (
maxReqSizeTCP = 4096
)
// TrampleDNS is an event indicating we detected that DNS config was
// overwritten by another process.
//
// The Manager subscribes to this event and, if it still has a DNS config,
// applies that config again.
type TrampleDNS struct {
// LastTrample is presumably the time the most recent overwrite was
// detected. NOTE(review): confirm against the publisher of this event.
LastTrample time.Time
// TramplesInTimeout is presumably the number of overwrites seen within the
// publisher's current timeout window. NOTE(review): confirm against the
// publisher of this event.
TramplesInTimeout int64
}
// dnsTCPSession services DNS requests sent over TCP.
type dnsTCPSession struct {
m *Manager
@@ -585,6 +615,7 @@ func (m *Manager) Down() error {
if err := m.os.Close(); err != nil {
return err
}
m.eventClient.Close()
m.resolver.Close()
return nil
}
@@ -605,7 +636,7 @@ func CleanUp(logf logger.Logf, netMon *netmon.Monitor, bus *eventbus.Bus, health
if !buildfeatures.HasDNS {
return
}
oscfg, err := NewOSConfigurator(logf, health, policyclient.Get(), nil, interfaceName)
oscfg, err := NewOSConfigurator(logf, health, bus, policyclient.Get(), nil, interfaceName)
if err != nil {
logf("creating dns cleanup: %v", err)
return
@@ -613,12 +644,10 @@ func CleanUp(logf logger.Logf, netMon *netmon.Monitor, bus *eventbus.Bus, health
d := &tsdial.Dialer{Logf: logf}
d.SetNetMon(netMon)
d.SetBus(bus)
dns := NewManager(logf, oscfg, health, d, nil, nil, runtime.GOOS)
dns := NewManager(logf, oscfg, health, d, nil, nil, runtime.GOOS, bus)
if err := dns.Down(); err != nil {
logf("dns down: %v", err)
}
}
var (
metricDNSQueryErrorQueue = clientmetric.NewCounter("dns_query_local_error_queue")
)
var metricDNSQueryErrorQueue = clientmetric.NewCounter("dns_query_local_error_queue")