net/connstats: enforce maximum number of connections (#6760)

The Tailscale logging service has a hard limit on the maximum
log message size that can be accepted.
We want to ensure that netlog messages never exceed
this limit; otherwise, a client cannot transmit logs.

Move the goroutine for periodically dumping netlog messages
from wgengine/netlog to net/connstats.
This allows net/connstats to manage when it dumps messages,
based on either time or size.

Updates tailscale/corp#8427

Signed-off-by: Joe Tsai <joetsai@digital-static.net>
This commit is contained in:
Joe Tsai
2022-12-16 10:14:00 -08:00
committed by GitHub
parent 651e0d8aad
commit d9df023e6f
6 changed files with 210 additions and 110 deletions
+8 -4
View File
@@ -135,7 +135,7 @@ func runDERPAndStun(t *testing.T, logf logger.Logf, l nettype.PacketListener, st
type magicStack struct {
privateKey key.NodePrivate
epCh chan []tailcfg.Endpoint // endpoint updates produced by this peer
stats connstats.Statistics // per-connection statistics
stats *connstats.Statistics // per-connection statistics
conn *Conn // the magicsock itself
tun *tuntest.ChannelTUN // TUN device to send/receive packets
tsTun *tstun.Wrapper // wrapped tun that implements filtering and wgengine hooks
@@ -1053,11 +1053,15 @@ func testTwoDevicePing(t *testing.T, d *devices) {
}
}
m1.conn.SetStatistics(&m1.stats)
m2.conn.SetStatistics(&m2.stats)
m1.stats = connstats.NewStatistics(0, 0, nil)
defer m1.stats.Shutdown(context.Background())
m1.conn.SetStatistics(m1.stats)
m2.stats = connstats.NewStatistics(0, 0, nil)
defer m2.stats.Shutdown(context.Background())
m2.conn.SetStatistics(m2.stats)
checkStats := func(t *testing.T, m *magicStack, wantConns []netlogtype.Connection) {
_, stats := m.stats.Extract()
_, stats := m.stats.TestExtract()
for _, conn := range wantConns {
if _, ok := stats[conn]; ok {
return
+49 -73
View File
@@ -17,7 +17,6 @@ import (
"sync"
"time"
"golang.org/x/sync/errgroup"
"tailscale.com/logpolicy"
"tailscale.com/logtail"
"tailscale.com/net/connstats"
@@ -25,6 +24,7 @@ import (
"tailscale.com/smallzstd"
"tailscale.com/tailcfg"
"tailscale.com/types/netlogtype"
"tailscale.com/util/multierr"
"tailscale.com/wgengine/router"
)
@@ -32,8 +32,7 @@ import (
const pollPeriod = 5 * time.Second
// Device is an abstraction over a tunnel device or a magic socket.
// *tstun.Wrapper implements this interface.
// *magicsock.Conn implements this interface.
// Both *tstun.Wrapper and *magicsock.Conn implement this interface.
type Device interface {
SetStatistics(*connstats.Statistics)
}
@@ -47,15 +46,15 @@ func (noopDevice) SetStatistics(*connstats.Statistics) {}
// Exit node traffic is not logged for privacy reasons.
// The zero value is ready for use.
type Logger struct {
mu sync.Mutex
mu sync.Mutex // protects all fields below
logger *logtail.Logger
stats *connstats.Statistics
tun Device
sock Device
addrs map[netip.Addr]bool
prefixes map[netip.Prefix]bool
group errgroup.Group
cancel context.CancelFunc
}
// Running reports whether the logger is running.
@@ -97,18 +96,13 @@ func (nl *Logger) Startup(nodeID tailcfg.StableNodeID, nodeLogID, domainLogID lo
if nl.logger != nil {
return fmt.Errorf("network logger already running for %v", nl.logger.PrivateID().Public())
}
if tun == nil {
tun = noopDevice{}
}
if sock == nil {
sock = noopDevice{}
}
// Startup a log stream to Tailscale's logging service.
httpc := &http.Client{Transport: logpolicy.NewLogtailTransport(logtail.DefaultHost)}
if testClient != nil {
httpc = testClient
}
logger := logtail.NewLogger(logtail.Config{
nl.logger = logtail.NewLogger(logtail.Config{
Collection: "tailtraffic.log.tailscale.io",
PrivateID: nodeLogID,
CopyPrivateID: domainLogID,
@@ -127,47 +121,34 @@ func (nl *Logger) Startup(nodeID tailcfg.StableNodeID, nodeLogID, domainLogID lo
IncludeProcID: true,
IncludeProcSequence: true,
}, log.Printf)
nl.logger = logger
stats := new(connstats.Statistics)
ctx, cancel := context.WithCancel(context.Background())
nl.cancel = cancel
nl.group.Go(func() error {
tun.SetStatistics(stats)
defer tun.SetStatistics(nil)
sock.SetStatistics(stats)
defer sock.SetStatistics(nil)
start := time.Now()
ticker := time.NewTicker(pollPeriod)
for {
var end time.Time
select {
case <-ctx.Done():
end = time.Now()
case end = <-ticker.C:
}
// NOTE: connstats and sockStats will always be slightly out-of-sync.
// It is impossible to have an atomic snapshot of statistics
// at both layers without a global mutex that spans all layers.
connstats, sockStats := stats.Extract()
if len(connstats)+len(sockStats) > 0 {
nl.mu.Lock()
addrs := nl.addrs
prefixes := nl.prefixes
nl.mu.Unlock()
recordStatistics(logger, nodeID, start, end, connstats, sockStats, addrs, prefixes)
}
if ctx.Err() != nil {
break
}
start = end.Add(time.Nanosecond)
}
return nil
// Startup a data structure to track per-connection statistics.
// There is a maximum size for individual log messages that logtail
// can upload to the Tailscale log service, so stay below this limit.
const maxLogSize = 256 << 10
const maxConns = (maxLogSize - netlogtype.MaxMessageJSONSize) / netlogtype.MaxConnectionCountsJSONSize
nl.stats = connstats.NewStatistics(pollPeriod, maxConns, func(start, end time.Time, virtual, physical map[netlogtype.Connection]netlogtype.Counts) {
nl.mu.Lock()
addrs := nl.addrs
prefixes := nl.prefixes
nl.mu.Unlock()
recordStatistics(nl.logger, nodeID, start, end, virtual, physical, addrs, prefixes)
})
// Register the connection tracker into the TUN device.
if tun == nil {
tun = noopDevice{}
}
nl.tun = tun
nl.tun.SetStatistics(nl.stats)
// Register the connection tracker into magicsock.
if sock == nil {
sock = noopDevice{}
}
nl.sock = sock
nl.sock.SetStatistics(nl.stats)
return nil
}
@@ -222,21 +203,8 @@ func recordStatistics(logger *logtail.Logger, nodeID tailcfg.StableNodeID, start
}
if len(m.VirtualTraffic)+len(m.SubnetTraffic)+len(m.ExitTraffic)+len(m.PhysicalTraffic) > 0 {
// TODO(joetsai): Place a hard limit on the size of a network log message.
// The log server rejects any payloads above a certain size, so logging
// a message that large would cause logtail to be stuck forever trying
// and failing to upload the same excessively large payload.
//
// We should figure out the behavior for handling this. We could split
// the message apart so that there are multiple chunks with the same window,
// We could also consider reducing the granularity of the data
// by dropping port numbers.
const maxSize = 256 << 10
if b, err := json.Marshal(m); err != nil {
logger.Logf("json.Marshal error: %v", err)
} else if len(b) > maxSize {
logger.Logf("JSON body too large: %dB (virtual:%d subnet:%d exit:%d physical:%d)",
len(b), len(m.VirtualTraffic), len(m.SubnetTraffic), len(m.ExitTraffic), len(m.PhysicalTraffic))
} else {
logger.Logf("%s", b)
}
@@ -285,15 +253,23 @@ func (nl *Logger) Shutdown(ctx context.Context) error {
if nl.logger == nil {
return nil
}
nl.cancel()
nl.mu.Unlock()
nl.group.Wait() // do not hold lock while waiting
nl.mu.Lock()
err := nl.logger.Shutdown(ctx)
// Shutdown in reverse order of Startup.
// Do not hold lock while shutting down since this may flush one last time.
nl.mu.Unlock()
nl.sock.SetStatistics(nil)
nl.tun.SetStatistics(nil)
err1 := nl.stats.Shutdown(ctx)
err2 := nl.logger.Shutdown(ctx)
nl.mu.Lock()
// Purge state.
nl.logger = nil
nl.stats = nil
nl.tun = nil
nl.sock = nil
nl.addrs = nil
nl.prefixes = nil
nl.cancel = nil
return err
return multierr.New(err1, err2)
}