From 25a8227291715a499ce4ed957d6d9f79d041b46a Mon Sep 17 00:00:00 2001
From: James Bardin
Date: Sun, 8 Oct 2017 12:57:11 -0400
Subject: [PATCH] add broken test for lock lost on connection error

Add a way to inject network errors by setting an immediate deadline on
open consul connections. The consul client currently doesn't retry on
some errors, and will force us to lose our lock.

Once the consul api client is fixed, this test will fail.
---
 backend/remote-state/consul/backend.go       |  13 ++-
 backend/remote-state/consul/backend_state.go |   5 +
 backend/remote-state/consul/client.go        |   4 +-
 backend/remote-state/consul/client_test.go   | 100 +++++++++++++++++++
 4 files changed, 116 insertions(+), 6 deletions(-)

diff --git a/backend/remote-state/consul/backend.go b/backend/remote-state/consul/backend.go
index d0cfa6ca4f..271a60b634 100644
--- a/backend/remote-state/consul/backend.go
+++ b/backend/remote-state/consul/backend.go
@@ -120,11 +120,7 @@ func (b *Backend) configure(ctx context.Context) error {
 	config := consulapi.DefaultConfig()
 
 	// replace the default Transport Dialer to reduce the KeepAlive
-
-	config.Transport.DialContext = (&net.Dialer{
-		Timeout:   30 * time.Second,
-		KeepAlive: 17 * time.Second,
-	}).DialContext
+	config.Transport.DialContext = dialContext
 
 	if v, ok := data.GetOk("access_token"); ok && v.(string) != "" {
 		config.Token = v.(string)
@@ -175,3 +171,10 @@ func (b *Backend) configure(ctx context.Context) error {
 	b.client = client
 	return nil
 }
+
+// dialContext is the DialContext function for the consul client transport.
+// This is stored in a package var to inject a different dialer for tests.
+var dialContext = (&net.Dialer{
+	Timeout:   30 * time.Second,
+	KeepAlive: 17 * time.Second,
+}).DialContext
diff --git a/backend/remote-state/consul/backend_state.go b/backend/remote-state/consul/backend_state.go
index 8002d98303..95010aa0e6 100644
--- a/backend/remote-state/consul/backend_state.go
+++ b/backend/remote-state/consul/backend_state.go
@@ -85,6 +85,11 @@ func (b *Backend) State(name string) (state.State, error) {
 		stateMgr = &state.LockDisabled{Inner: stateMgr}
 	}
 
+	// the default state always exists
+	if name == backend.DefaultStateName {
+		return stateMgr, nil
+	}
+
 	// Grab a lock, we use this to write an empty state if one doesn't
 	// exist already. We have to write an empty state as a sentinel value
 	// so States() knows it exists.
diff --git a/backend/remote-state/consul/client.go b/backend/remote-state/consul/client.go
index e180210912..bd37712f3f 100644
--- a/backend/remote-state/consul/client.go
+++ b/backend/remote-state/consul/client.go
@@ -32,6 +32,8 @@ const (
 	lockReacquireInterval = 2 * time.Second
 )
 
+var lostLockErr = errors.New("consul lock was lost")
+
 // RemoteClient is a remote client that stores data in Consul.
 type RemoteClient struct {
 	Client *consulapi.Client
@@ -409,7 +411,7 @@ func (c *RemoteClient) unlock(id string) error {
 
 	select {
 	case <-c.lockCh:
-		return errors.New("consul lock was lost")
+		return lostLockErr
 	default:
 	}
 
diff --git a/backend/remote-state/consul/client_test.go b/backend/remote-state/consul/client_test.go
index 910c5e9ebf..09e1a7d9ab 100644
--- a/backend/remote-state/consul/client_test.go
+++ b/backend/remote-state/consul/client_test.go
@@ -1,7 +1,10 @@
 package consul
 
 import (
+	"context"
 	"fmt"
+	"net"
+	"sync"
 	"testing"
 	"time"
 
@@ -188,3 +191,100 @@ func TestConsul_lostLock(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestConsul_lostLockConnection(t *testing.T) {
+	srv := newConsulTestServer(t)
+	defer srv.Stop()
+
+	// create an "unreliable" network by closing all the consul client's
+	// network connections
+	conns := &unreliableConns{}
+	origDialFn := dialContext
+	defer func() {
+		dialContext = origDialFn
+	}()
+	dialContext = conns.DialContext
+
+	path := fmt.Sprintf("tf-unit/%s", time.Now().String())
+
+	b := backend.TestBackendConfig(t, New(), map[string]interface{}{
+		"address": srv.HTTPAddr,
+		"path":    path,
+	})
+
+	s, err := b.State(backend.DefaultStateName)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	info := state.NewLockInfo()
+	info.Operation = "test-lost-lock-connection"
+	id, err := s.Lock(info)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// set a callback to know when the monitor loop re-connects
+	dialed := make(chan struct{})
+	conns.dialCallback = func() {
+		close(dialed)
+		conns.dialCallback = nil
+	}
+
+	// kill any open connections
+	// once the consul client is fixed, we should loop over this a few times to
+	// be sure, since we can't hook into the client's internal lock monitor
+	// loop.
+	conns.Kill()
+	// wait for a new connection to be dialed, and kill it again
+	<-dialed
+	conns.Kill()
+
+	// since the lock monitor loop is hidden in the consul api client, we can
+	// only wait a bit to make sure we were notified of the failure
+	time.Sleep(time.Second)
+
+	// once the consul client can reconnect properly, there will no longer be
+	// an error here
+	//if err := s.Unlock(id); err != nil {
+	if err := s.Unlock(id); err != lostLockErr {
+		t.Fatalf("expected lost lock error, got %v", err)
+	}
+}
+
+type unreliableConns struct {
+	sync.Mutex
+	conns        []net.Conn
+	dialCallback func()
+}
+
+func (u *unreliableConns) DialContext(ctx context.Context, netw, addr string) (net.Conn, error) {
+	u.Lock()
+	defer u.Unlock()
+
+	dialer := &net.Dialer{}
+	conn, err := dialer.DialContext(ctx, netw, addr)
+	if err != nil {
+		return nil, err
+	}
+
+	u.conns = append(u.conns, conn)
+
+	if u.dialCallback != nil {
+		u.dialCallback()
+	}
+
+	return conn, nil
+}
+
+// Kill these with a deadline, just to make sure we don't end up with any EOFs
+// that get ignored.
+func (u *unreliableConns) Kill() {
+	u.Lock()
+	defer u.Unlock()
+
+	for _, conn := range u.conns {
+		conn.(*net.TCPConn).SetDeadline(time.Now())
+	}
+	u.conns = nil
+}
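
Reviewer note on the approach: the failure injection works by swapping the package-level dialContext variable for a dialer that records every connection it opens and can later sever them all with an immediate deadline, so the next read or write fails with a timeout rather than a clean EOF the client might silently ignore. The standalone sketch below restates that pattern outside the backend; the package and names (failnet, trackingDialer, KillAll) are illustrative only and are not part of this patch.

package failnet

import (
	"context"
	"net"
	"sync"
	"time"
)

// dialContext is the injection point: production code reads this package
// variable, and tests may replace it (mirrors the var added to backend.go
// in this patch).
var dialContext = (&net.Dialer{
	Timeout:   30 * time.Second,
	KeepAlive: 17 * time.Second,
}).DialContext

// trackingDialer records every connection it opens so a test can later
// force-fail all of them. (Illustrative name; the patch's version is
// unreliableConns.)
type trackingDialer struct {
	mu    sync.Mutex
	conns []net.Conn
}

// DialContext dials normally, then remembers the connection.
func (d *trackingDialer) DialContext(ctx context.Context, network, addr string) (net.Conn, error) {
	conn, err := (&net.Dialer{}).DialContext(ctx, network, addr)
	if err != nil {
		return nil, err
	}
	d.mu.Lock()
	d.conns = append(d.conns, conn)
	d.mu.Unlock()
	return conn, nil
}

// KillAll sets an immediate deadline on every tracked connection, so the
// next read or write returns a timeout error rather than an EOF.
func (d *trackingDialer) KillAll() {
	d.mu.Lock()
	defer d.mu.Unlock()
	for _, conn := range d.conns {
		conn.SetDeadline(time.Now())
	}
	d.conns = nil
}

A test using this sketch would keep a reference to the dialer (d := &trackingDialer{}; dialContext = d.DialContext), acquire the lock, then call d.KillAll() to simulate the network dropping out from under the established session, which is what TestConsul_lostLockConnection does with unreliableConns above.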