control/controlclient: improve handling of concurrent lite map requests

This reverts commit 6eca47b16c and fixes forward.

Previously, the first-ever streaming MapRequest that a client sent would also
set ReadOnly to true, because the client didn't have any endpoints yet and
expected (and relied on) the map poll to restart as soon as it got endpoints.
However, with 48f6c1eba4 we no longer restart MapRequests as frequently as we
used to, so control would only ever get that first streaming MapRequest, which had ReadOnly=true.

Control would treat this as an uninteresting request and would not send the client
any further netmaps, and the client would happily stay in the map poll forever
while lite map updates happened in parallel.

This change makes it so that we never set `ReadOnly=true` when we are doing a streaming
MapRequest. Setting it is no longer necessary anyway, as most endpoint discovery happens
over disco.

Co-authored-by: Andrew Dunham <andrew@du.nham.ca>
Signed-off-by: Maisem Ali <maisem@tailscale.com>
This commit is contained in:
Maisem Ali
2023-03-08 17:15:47 -08:00
committed by Maisem Ali
parent 87b4bbb94f
commit be027a9899
3 changed files with 77 additions and 44 deletions
+56 -17
View File
@@ -59,15 +59,17 @@ type Auto struct {
mu sync.Mutex // mutex guards the following fields
paused bool // whether we should stop making HTTP requests
unpauseWaiters []chan struct{}
loggedIn bool // true if currently logged in
loginGoal *LoginGoal // non-nil if some login activity is desired
synced bool // true if our netmap is up-to-date
inPollNetMap bool // true if currently running a PollNetMap
inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding
inSendStatus int // number of sendStatus calls currently in progress
state State
paused bool // whether we should stop making HTTP requests
unpauseWaiters []chan struct{}
loggedIn bool // true if currently logged in
loginGoal *LoginGoal // non-nil if some login activity is desired
synced bool // true if our netmap is up-to-date
inPollNetMap bool // true if currently running a PollNetMap
inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding
liteMapUpdateCancel context.CancelFunc // cancels a lite map update, may be nil
liteMapUpdateCancels int // how many times we've canceled a lite map update
inSendStatus int // number of sendStatus calls currently in progress
state State
authCtx context.Context // context used for auth requests
mapCtx context.Context // context used for netmap requests
@@ -168,28 +170,56 @@ func (c *Auto) Start() {
func (c *Auto) sendNewMapRequest() {
c.mu.Lock()
// If we're not already streaming a netmap, or if we're already stuck
// in a lite update, then tear down everything and start a new stream
// (which starts by sending a new map request)
if !c.inPollNetMap || c.inLiteMapUpdate || !c.loggedIn {
// If we're not already streaming a netmap, then tear down everything
// and start a new stream (which starts by sending a new map request)
if !c.inPollNetMap || !c.loggedIn {
c.mu.Unlock()
c.cancelMapSafely()
return
}
// If we are already in process of doing a LiteMapUpdate, cancel it and
// try a new one. If this is the 10th time we have done this
// cancelation, tear down everything and start again.
const maxLiteMapUpdateAttempts = 10
if c.inLiteMapUpdate {
// Always cancel the in-flight lite map update, regardless of
// whether we cancel the streaming map request or not.
c.liteMapUpdateCancel()
c.inLiteMapUpdate = false
if c.liteMapUpdateCancels >= maxLiteMapUpdateAttempts {
// Not making progress
c.mu.Unlock()
c.cancelMapSafely()
return
}
// Increment our cancel counter and continue below to start a
// new lite update.
c.liteMapUpdateCancels++
}
// Otherwise, send a lite update that doesn't keep a
// long-running stream response.
defer c.mu.Unlock()
c.inLiteMapUpdate = true
ctx, cancel := context.WithTimeout(c.mapCtx, 10*time.Second)
c.liteMapUpdateCancel = cancel
go func() {
defer cancel()
t0 := time.Now()
err := c.direct.SendLiteMapUpdate(ctx)
d := time.Since(t0).Round(time.Millisecond)
c.mu.Lock()
c.inLiteMapUpdate = false
c.liteMapUpdateCancel = nil
if err == nil {
c.liteMapUpdateCancels = 0
}
c.mu.Unlock()
if err == nil {
c.logf("[v1] successful lite map update in %v", d)
return
@@ -197,10 +227,13 @@ func (c *Auto) sendNewMapRequest() {
if ctx.Err() == nil {
c.logf("lite map update after %v: %v", d, err)
}
// Fall back to restarting the long-polling map
// request (the old heavy way) if the lite update
// failed for any reason.
c.cancelMapSafely()
if !errors.Is(ctx.Err(), context.Canceled) {
// Fall back to restarting the long-polling map
// request (the old heavy way) if the lite update
// failed for reasons other than the context being
// canceled.
c.cancelMapSafely()
}
}()
}
@@ -237,6 +270,12 @@ func (c *Auto) cancelMapSafely() {
c.mu.Lock()
defer c.mu.Unlock()
// Always reset our lite map cancels counter if we're canceling
// everything, since we're about to restart with a new map update; this
// allows future calls to sendNewMapRequest to retry sending lite
// updates.
c.liteMapUpdateCancels = 0
c.logf("[v1] cancelMapSafely: synced=%v", c.synced)
if c.inPollNetMap {