tsconsensus: fix race condition in TestOnlyTaggedPeersCanBeDialed

TestOnlyTaggedPeersCanBeDialed has a race condition:
- The test untags ps[2] and waits until ps[0] sees this tag dropped from
  ps[2] in the netmap.
- Later the test tries to dial ps[2] from ps[0] and expects the dial to
  fail as authorization to dial relies on the presence of the tag, now
  removed from ps[2].
- However, the authorization layer caches the status used to consult peer
  tags. When the dial happens before the cache times out, the test fails.
- Due to a bug in testcontrol.Server.UpdateNode, which the test uses to
  remove the tag, netmap updates are not immediately triggered. The test
  has to wait for the next natural set of netmap updates, which on my
  machine takes about 22 seconds. As a result, the cache in the
  authorization layer times out and the test passes.
- If one fixes the bug in UpdateNode, then netmap updates happen
  immediately, the cache is no longer timed out when the dial occurs, and
  the test fails.

Fixes #18720
Updates #18703

Signed-off-by: Harry Harpham <harry@tailscale.com>
main
Harry Harpham 2 months ago
parent a7a864419d
commit fbbf0d6669
  1. 21
      tsconsensus/authorization.go
  2. 6
      tsconsensus/tsconsensus_test.go

@ -17,6 +17,10 @@ import (
"tailscale.com/util/set" "tailscale.com/util/set"
) )
// defaultStatusCacheTimeout is the duration after which cached status will be
// disregarded. See tailscaleStatusGetter.cacheTimeout.
const defaultStatusCacheTimeout = time.Second
type statusGetter interface { type statusGetter interface {
getStatus(context.Context) (*ipnstate.Status, error) getStatus(context.Context) (*ipnstate.Status, error)
} }
@ -24,6 +28,10 @@ type statusGetter interface {
type tailscaleStatusGetter struct { type tailscaleStatusGetter struct {
ts *tsnet.Server ts *tsnet.Server
// cacheTimeout is used to determine when the cached status should be
// disregarded and a new status fetched. Zero means ignore the cache.
cacheTimeout time.Duration
mu sync.Mutex // protects the following mu sync.Mutex // protects the following
lastStatus *ipnstate.Status lastStatus *ipnstate.Status
lastStatusTime time.Time lastStatusTime time.Time
@ -40,7 +48,7 @@ func (sg *tailscaleStatusGetter) fetchStatus(ctx context.Context) (*ipnstate.Sta
func (sg *tailscaleStatusGetter) getStatus(ctx context.Context) (*ipnstate.Status, error) { func (sg *tailscaleStatusGetter) getStatus(ctx context.Context) (*ipnstate.Status, error) {
sg.mu.Lock() sg.mu.Lock()
defer sg.mu.Unlock() defer sg.mu.Unlock()
if sg.lastStatus != nil && time.Since(sg.lastStatusTime) < 1*time.Second { if sg.lastStatus != nil && time.Since(sg.lastStatusTime) < sg.cacheTimeout {
return sg.lastStatus, nil return sg.lastStatus, nil
} }
status, err := sg.fetchStatus(ctx) status, err := sg.fetchStatus(ctx)
@ -61,14 +69,23 @@ type authorization struct {
} }
func newAuthorization(ts *tsnet.Server, tag string) *authorization { func newAuthorization(ts *tsnet.Server, tag string) *authorization {
return newAuthorizationWithCacheTimeout(ts, tag, defaultStatusCacheTimeout)
}
func newAuthorizationWithCacheTimeout(ts *tsnet.Server, tag string, cacheTimeout time.Duration) *authorization {
return &authorization{ return &authorization{
sg: &tailscaleStatusGetter{ sg: &tailscaleStatusGetter{
ts: ts, ts: ts,
cacheTimeout: cacheTimeout,
}, },
tag: tag, tag: tag,
} }
} }
func newAuthorizationForTest(ts *tsnet.Server, tag string) *authorization {
return newAuthorizationWithCacheTimeout(ts, tag, 0)
}
func (a *authorization) Refresh(ctx context.Context) error { func (a *authorization) Refresh(ctx context.Context) error {
tStatus, err := a.sg.getStatus(ctx) tStatus, err := a.sg.getStatus(ctx)
if err != nil { if err != nil {

@ -642,7 +642,7 @@ func TestOnlyTaggedPeersCanBeDialed(t *testing.T) {
// make a StreamLayer for ps[0] // make a StreamLayer for ps[0]
ts := ps[0].ts ts := ps[0].ts
auth := newAuthorization(ts, clusterTag) auth := newAuthorizationForTest(ts, clusterTag)
port := 19841 port := 19841
lns := make([]net.Listener, 3) lns := make([]net.Listener, 3)
@ -692,10 +692,12 @@ func TestOnlyTaggedPeersCanBeDialed(t *testing.T) {
conn.Close() conn.Close()
_, err = sl.Dial(a2, 2*time.Second) _, err = sl.Dial(a2, 2*time.Second)
if err == nil {
t.Fatal("expected dial error to untagged node, got none")
}
if err.Error() != "dial: peer is not allowed" { if err.Error() != "dial: peer is not allowed" {
t.Fatalf("expected dial: peer is not allowed, got: %v", err) t.Fatalf("expected dial: peer is not allowed, got: %v", err)
} }
} }
func TestOnlyTaggedPeersCanJoin(t *testing.T) { func TestOnlyTaggedPeersCanJoin(t *testing.T) {

Loading…
Cancel
Save