So users can see why things aren't working. A start. More diagnostics coming. Updates #1094main
parent
5eeaea9ef9
commit
85e54af0d7
@ -0,0 +1,157 @@ |
||||
// Copyright (c) 2021 Tailscale Inc & AUTHORS All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package wgengine |
||||
|
||||
import ( |
||||
"os" |
||||
"strconv" |
||||
"time" |
||||
|
||||
"tailscale.com/net/flowtrack" |
||||
"tailscale.com/net/packet" |
||||
"tailscale.com/wgengine/filter" |
||||
"tailscale.com/wgengine/tstun" |
||||
) |
||||
|
||||
const tcpTimeoutBeforeDebug = 5 * time.Second |
||||
|
||||
// debugConnectFailures reports whether the local node should track
|
||||
// outgoing TCP connections and log which ones fail and why.
|
||||
func debugConnectFailures() bool { |
||||
s := os.Getenv("TS_DEBUG_CONNECT_FAILURES") |
||||
if s == "" { |
||||
return true |
||||
} |
||||
v, _ := strconv.ParseBool(s) |
||||
return v |
||||
} |
||||
|
||||
type pendingOpenFlow struct { |
||||
timer *time.Timer // until giving up on the flow
|
||||
} |
||||
|
||||
func (e *userspaceEngine) trackOpenPreFilterIn(pp *packet.Parsed, t *tstun.TUN) (res filter.Response) { |
||||
res = filter.Accept // always
|
||||
|
||||
if pp.IPVersion == 0 || |
||||
pp.IPProto != packet.TCP || |
||||
pp.TCPFlags&(packet.TCPSyn|packet.TCPRst) == 0 { |
||||
return |
||||
} |
||||
|
||||
flow := flowtrack.Tuple{Dst: pp.Src, Src: pp.Dst} // src/dst reversed
|
||||
|
||||
e.mu.Lock() |
||||
defer e.mu.Unlock() |
||||
of, ok := e.pendOpen[flow] |
||||
if !ok { |
||||
// Not a tracked flow.
|
||||
return |
||||
} |
||||
of.timer.Stop() |
||||
delete(e.pendOpen, flow) |
||||
|
||||
if pp.TCPFlags&packet.TCPRst != 0 { |
||||
// TODO(bradfitz): have peer send a IP proto 99 "why"
|
||||
// packet first with details and log that instead
|
||||
// (e.g. ACL prohibited, shields up, etc).
|
||||
e.logf("open-conn-track: flow %v got RST by peer", flow) |
||||
return |
||||
} |
||||
|
||||
return |
||||
} |
||||
|
||||
func (e *userspaceEngine) trackOpenPostFilterOut(pp *packet.Parsed, t *tstun.TUN) (res filter.Response) { |
||||
res = filter.Accept // always
|
||||
|
||||
if pp.IPVersion == 0 || |
||||
pp.IPProto != packet.TCP || |
||||
pp.TCPFlags&packet.TCPSyn == 0 { |
||||
return |
||||
} |
||||
|
||||
flow := flowtrack.Tuple{Src: pp.Src, Dst: pp.Dst} |
||||
timer := time.AfterFunc(tcpTimeoutBeforeDebug, func() { |
||||
e.onOpenTimeout(flow) |
||||
}) |
||||
|
||||
e.mu.Lock() |
||||
defer e.mu.Unlock() |
||||
if e.pendOpen == nil { |
||||
e.pendOpen = make(map[flowtrack.Tuple]*pendingOpenFlow) |
||||
} |
||||
if _, dup := e.pendOpen[flow]; dup { |
||||
// Duplicates are expected when the OS retransmits. Ignore.
|
||||
return |
||||
} |
||||
e.pendOpen[flow] = &pendingOpenFlow{timer: timer} |
||||
|
||||
return filter.Accept |
||||
} |
||||
|
||||
func (e *userspaceEngine) onOpenTimeout(flow flowtrack.Tuple) { |
||||
e.mu.Lock() |
||||
if _, ok := e.pendOpen[flow]; !ok { |
||||
// Not a tracked flow, or already handled & deleted.
|
||||
e.mu.Unlock() |
||||
return |
||||
} |
||||
delete(e.pendOpen, flow) |
||||
e.mu.Unlock() |
||||
|
||||
// Diagnose why it might've timed out.
|
||||
n, ok := e.magicConn.PeerForIP(flow.Dst.IP) |
||||
if !ok { |
||||
e.logf("open-conn-track: timeout opening %v; no associated peer node", flow) |
||||
return |
||||
} |
||||
if n.DiscoKey.IsZero() { |
||||
e.logf("open-conn-track: timeout opening %v; peer node %v running pre-0.100", flow, n.Key.ShortString()) |
||||
return |
||||
} |
||||
if n.DERP == "" { |
||||
e.logf("open-conn-track: timeout opening %v; peer node %v not connected to any DERP relay", flow, n.Key.ShortString()) |
||||
return |
||||
} |
||||
var lastSeen time.Time |
||||
if n.LastSeen != nil { |
||||
lastSeen = *n.LastSeen |
||||
} |
||||
|
||||
var ps *PeerStatus |
||||
if st, err := e.getStatus(); err == nil { |
||||
for _, v := range st.Peers { |
||||
if v.NodeKey == n.Key { |
||||
v := v // copy
|
||||
ps = &v |
||||
} |
||||
} |
||||
} else { |
||||
e.logf("open-conn-track: timeout opening %v to node %v; failed to get engine status: %v", flow, n.Key.ShortString(), err) |
||||
return |
||||
} |
||||
if ps == nil { |
||||
e.logf("open-conn-track: timeout opening %v; target node %v in netmap but unknown to wireguard", flow, n.Key.ShortString()) |
||||
return |
||||
} |
||||
|
||||
// TODO(bradfitz): figure out what PeerStatus.LastHandshake
|
||||
// is. It appears to be the last time we sent a handshake,
|
||||
// which isn't what I expected. I thought it was when a
|
||||
// handshake completed, which is what I want.
|
||||
_ = ps.LastHandshake |
||||
|
||||
e.logf("open-conn-track: timeout opening %v to node %v; lastSeen=%v, lastRecv=%v", |
||||
flow, n.Key.ShortString(), |
||||
agoOrNever(lastSeen), agoOrNever(e.magicConn.LastRecvActivityOfDisco(n.DiscoKey))) |
||||
} |
||||
|
||||
func agoOrNever(t time.Time) string { |
||||
if t.IsZero() { |
||||
return "never" |
||||
} |
||||
return time.Since(t).Round(time.Second).String() |
||||
} |
||||
Loading…
Reference in new issue