cmd/k8s-operator: fix Service reconcile triggers for default ProxyClass (#18983)
The e2e ingress test was very occasionally flaky. The operator logs from one failure show that the default ProxyClass was not ready before the first reconcile loop for the exposed Service. The ProxyClass became ready soon after, but no additional reconciles were triggered for the exposed Service, because we only triggered reconciles for Services that explicitly named their ProxyClass.

This change adds list API calls that run when the default ProxyClass is the one that has been updated, so that Services which use it by default are also reconciled. It also adds indexes for the fields those list calls search on, to ensure the list is efficient.

Fixes tailscale/corp#37533

Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
This commit is contained in:
@@ -5,7 +5,6 @@ package e2e
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"testing"
|
||||
@@ -14,10 +13,6 @@ import (
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
"sigs.k8s.io/yaml"
|
||||
|
||||
"tailscale.com/cmd/testwrapper/flakytest"
|
||||
kube "tailscale.com/k8s-operator"
|
||||
@@ -90,81 +85,20 @@ func TestIngress(t *testing.T) {
|
||||
}
|
||||
createAndCleanup(t, kubeClient, svc)
|
||||
|
||||
// TODO(tomhjp): Delete once we've reproduced the flake with this extra info.
|
||||
t0 := time.Now()
|
||||
watcherCtx, cancelWatcher := context.WithCancel(t.Context())
|
||||
defer cancelWatcher()
|
||||
go func() {
|
||||
// client-go client for logs.
|
||||
clientGoKubeClient, err := kubernetes.NewForConfig(restCfg)
|
||||
if err != nil {
|
||||
t.Logf("error creating client-go Kubernetes client: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-watcherCtx.Done():
|
||||
t.Logf("stopping watcher after %v", time.Since(t0))
|
||||
return
|
||||
case <-time.After(time.Minute):
|
||||
t.Logf("dumping info after %v elapsed", time.Since(t0))
|
||||
// Service itself.
|
||||
svc := &corev1.Service{ObjectMeta: objectMeta("default", "test-ingress")}
|
||||
err := get(watcherCtx, kubeClient, svc)
|
||||
svcYaml, _ := yaml.Marshal(svc)
|
||||
t.Logf("Service: %s, error: %v\n%s", svc.Name, err, string(svcYaml))
|
||||
|
||||
// Pods in tailscale namespace.
|
||||
var pods corev1.PodList
|
||||
if err := kubeClient.List(watcherCtx, &pods, client.InNamespace("tailscale")); err != nil {
|
||||
t.Logf("error listing Pods in tailscale namespace: %v", err)
|
||||
} else {
|
||||
t.Logf("%d Pods", len(pods.Items))
|
||||
for _, pod := range pods.Items {
|
||||
podYaml, _ := yaml.Marshal(pod)
|
||||
t.Logf("Pod: %s\n%s", pod.Name, string(podYaml))
|
||||
logs := clientGoKubeClient.CoreV1().Pods("tailscale").GetLogs(pod.Name, &corev1.PodLogOptions{}).Do(watcherCtx)
|
||||
logData, err := logs.Raw()
|
||||
if err != nil {
|
||||
t.Logf("error reading logs for Pod %s: %v", pod.Name, err)
|
||||
continue
|
||||
}
|
||||
t.Logf("Logs for Pod %s:\n%s", pod.Name, string(logData))
|
||||
}
|
||||
}
|
||||
|
||||
// Tailscale status on the tailnet.
|
||||
lc, err := tnClient.LocalClient()
|
||||
if err != nil {
|
||||
t.Logf("error getting tailnet local client: %v", err)
|
||||
} else {
|
||||
status, err := lc.Status(watcherCtx)
|
||||
statusJSON, _ := json.MarshalIndent(status, "", " ")
|
||||
t.Logf("Tailnet status: %s, error: %v", string(statusJSON), err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// TODO: instead of timing out only when test times out, cancel context after 60s or so.
|
||||
if err := wait.PollUntilContextCancel(t.Context(), time.Millisecond*100, true, func(ctx context.Context) (done bool, err error) {
|
||||
if time.Since(t0) > time.Minute {
|
||||
t.Logf("%v elapsed waiting for Service default/test-ingress to become Ready", time.Since(t0))
|
||||
}
|
||||
if err := tstest.WaitFor(time.Minute, func() error {
|
||||
maybeReadySvc := &corev1.Service{ObjectMeta: objectMeta("default", "test-ingress")}
|
||||
if err := get(ctx, kubeClient, maybeReadySvc); err != nil {
|
||||
return false, err
|
||||
if err := get(t.Context(), kubeClient, maybeReadySvc); err != nil {
|
||||
return err
|
||||
}
|
||||
isReady := kube.SvcIsReady(maybeReadySvc)
|
||||
if isReady {
|
||||
t.Log("Service is ready")
|
||||
return nil
|
||||
}
|
||||
return isReady, nil
|
||||
return fmt.Errorf("Service is not ready yet")
|
||||
}); err != nil {
|
||||
t.Fatalf("error waiting for the Service to become Ready: %v", err)
|
||||
}
|
||||
cancelWatcher()
|
||||
|
||||
var resp *http.Response
|
||||
if err := tstest.WaitFor(time.Minute, func() error {
|
||||
|
||||
@@ -56,6 +56,7 @@ import (
|
||||
"tailscale.com/internal/client/tailscale"
|
||||
"tailscale.com/ipn"
|
||||
"tailscale.com/ipn/store/mem"
|
||||
tsoperator "tailscale.com/k8s-operator"
|
||||
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
|
||||
"tailscale.com/tsnet"
|
||||
)
|
||||
@@ -438,7 +439,7 @@ func runTests(m *testing.M) (int, error) {
|
||||
return 0, fmt.Errorf("failed to install %q via helm: %w", relName, err)
|
||||
}
|
||||
|
||||
if err := applyDefaultProxyClass(ctx, kubeClient); err != nil {
|
||||
if err := applyDefaultProxyClass(ctx, logger, kubeClient); err != nil {
|
||||
return 0, fmt.Errorf("failed to apply default ProxyClass: %w", err)
|
||||
}
|
||||
|
||||
@@ -537,7 +538,7 @@ func tagForRepo(dir string) (string, error) {
|
||||
return tag, nil
|
||||
}
|
||||
|
||||
func applyDefaultProxyClass(ctx context.Context, cl client.Client) error {
|
||||
func applyDefaultProxyClass(ctx context.Context, logger *zap.SugaredLogger, cl client.Client) error {
|
||||
pc := &tsapi.ProxyClass{
|
||||
TypeMeta: metav1.TypeMeta{
|
||||
APIVersion: tsapi.SchemeGroupVersion.String(),
|
||||
@@ -565,6 +566,24 @@ func applyDefaultProxyClass(ctx context.Context, cl client.Client) error {
|
||||
return fmt.Errorf("failed to apply default ProxyClass: %w", err)
|
||||
}
|
||||
|
||||
// Wait for the ProxyClass to be marked ready.
|
||||
ctx, cancel := context.WithTimeout(ctx, time.Minute)
|
||||
defer cancel()
|
||||
for {
|
||||
if err := cl.Get(ctx, client.ObjectKeyFromObject(pc), pc); err != nil {
|
||||
return fmt.Errorf("failed to get default ProxyClass: %w", err)
|
||||
}
|
||||
if tsoperator.ProxyClassIsReady(pc) {
|
||||
break
|
||||
}
|
||||
logger.Info("waiting for default ProxyClass to be ready...")
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return fmt.Errorf("timeout waiting for default ProxyClass to be ready")
|
||||
case <-time.After(time.Second):
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user