net/dns: retrample resolv.conf when another process has trampled it (#18069)

When using the resolv.conf file for setting DNS, it is possible that
some other service will trample the file and overwrite the DNS server
we set. Experiments have shown this to be a race that depends on how
quickly processes start.

Attempt to trample the file back a limited number of times if
the file is changed.

Updates #16635

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
This commit is contained in:
Claus Lensbøl
2025-12-09 14:55:26 -05:00
committed by GitHub
parent a9b37c510c
commit 1dfdee8521
17 changed files with 261 additions and 45 deletions
+35 -3
View File
@@ -21,6 +21,7 @@ import (
"slices"
"strings"
"sync"
"sync/atomic"
"time"
"tailscale.com/feature"
@@ -29,6 +30,7 @@ import (
"tailscale.com/net/tsaddr"
"tailscale.com/types/logger"
"tailscale.com/util/dnsname"
"tailscale.com/util/eventbus"
"tailscale.com/version/distro"
)
@@ -135,6 +137,11 @@ type directManager struct {
// but is better than having non-functioning DNS.
renameBroken bool
trampleCount atomic.Int64
trampleTimer *time.Timer
eventClient *eventbus.Client
trampleDNSPub *eventbus.Publisher[TrampleDNS]
ctx context.Context // valid until Close
ctxClose context.CancelFunc // closes ctx
@@ -145,11 +152,13 @@ type directManager struct {
}
//lint:ignore U1000 used in manager_{freebsd,openbsd}.go
func newDirectManager(logf logger.Logf, health *health.Tracker) *directManager {
return newDirectManagerOnFS(logf, health, directFS{})
func newDirectManager(logf logger.Logf, health *health.Tracker, bus *eventbus.Bus) *directManager {
return newDirectManagerOnFS(logf, health, bus, directFS{})
}
func newDirectManagerOnFS(logf logger.Logf, health *health.Tracker, fs wholeFileFS) *directManager {
var trampleWatchDuration = 5 * time.Second
func newDirectManagerOnFS(logf logger.Logf, health *health.Tracker, bus *eventbus.Bus, fs wholeFileFS) *directManager {
ctx, cancel := context.WithCancel(context.Background())
m := &directManager{
logf: logf,
@@ -158,6 +167,13 @@ func newDirectManagerOnFS(logf logger.Logf, health *health.Tracker, fs wholeFile
ctx: ctx,
ctxClose: cancel,
}
if bus != nil {
m.eventClient = bus.Client("dns.directManager")
m.trampleDNSPub = eventbus.Publish[TrampleDNS](m.eventClient)
}
m.trampleTimer = time.AfterFunc(trampleWatchDuration, func() {
m.trampleCount.Store(0)
})
go m.runFileWatcher()
return m
}
@@ -481,10 +497,26 @@ func (m *directManager) checkForFileTrample() {
}
m.logf("trample: resolv.conf changed from what we expected. did some other program interfere? current contents: %q", show)
m.health.SetUnhealthy(resolvTrampleWarnable, nil)
if m.trampleDNSPub != nil {
n := m.trampleCount.Add(1)
if n < 10 {
m.trampleDNSPub.Publish(TrampleDNS{
LastTrample: time.Now(),
TramplesInTimeout: n,
})
m.trampleTimer.Reset(trampleWatchDuration)
} else {
m.logf("trample: resolv.conf overwritten %d times, no longer attempting to replace it.", n)
}
}
}
func (m *directManager) Close() error {
m.ctxClose()
if m.eventClient != nil {
m.eventClient.Close()
}
// We used to keep a file for the tailscale config and symlinked
// to it, but then we stopped because /etc/resolv.conf being a