fixed data race in core.Connection.Close(). 61/35861/2
authorSergey Elantsev <elantsev.s@yandex.ru>
Fri, 1 Apr 2022 19:43:40 +0000 (22:43 +0300)
committerOndrej Fabry <ofabry@cisco.com>
Thu, 5 May 2022 01:29:15 +0000 (01:29 +0000)
The problem is triggered by the following events:
1. VPP stops.
2. core.Connection.healthCheckLoop() detects it and calls disconnectVPP(),
   which does close healthCheckDone channel for the first time.
   At this point Connection becomes unusable - re-entrance to
   connectLoop() will return immediately because of a closed
   healthCheckDone channel.
   But, the race is that connection may set c.vppConnected = 1
   before terminating the connect loop.
3. User calls Connection.Close(), who seed c.vppConnected = 1,
   and tries to close already closed channel, which leads to a panic.

This commit fixes this race by telling disconnectVPP() whether
it is a terminal stop triggered via Close(), or a temporary one
from connectLoop().

Change-Id: I555149da35ca3dc2b606bad59f2101266c0ef6b9
Signed-off-by: Sergey Elantsev <elantsev.s@yandex.ru>
core/connection.go
core/connection_test.go

index 1bfcae5..f796f3d 100644 (file)
@@ -216,14 +216,18 @@ func (c *Connection) Disconnect() {
                return
        }
        if c.vppClient != nil {
-               c.disconnectVPP()
+               c.disconnectVPP(true)
        }
 }
 
-// disconnectVPP disconnects from VPP in case it is connected.
-func (c *Connection) disconnectVPP() {
+// disconnectVPP disconnects from VPP in case it is connected. terminate tells
+// that disconnectVPP() was called from Close(), so healthCheckLoop() can be
+// terminated.
+func (c *Connection) disconnectVPP(terminate bool) {
        if atomic.CompareAndSwapUint32(&c.vppConnected, 1, 0) {
-               close(c.healthCheckDone)
+               if terminate {
+                       close(c.healthCheckDone)
+               }
                log.Debug("Disconnecting from VPP..")
 
                if err := c.vppClient.Disconnect(); err != nil {
@@ -383,7 +387,7 @@ HealthCheck:
        }
 
        // cleanup
-       c.disconnectVPP()
+       c.disconnectVPP(false)
 
        // we are now disconnected, start connect loop
        c.connectLoop()
index 230eea5..81b145c 100644 (file)
@@ -16,6 +16,7 @@ package core_test
 
 import (
        "testing"
+       "time"
 
        . "github.com/onsi/gomega"
 
@@ -91,6 +92,28 @@ func TestAsyncConnection(t *testing.T) {
        Expect(ev.State).Should(BeEquivalentTo(core.Connected))
 }
 
+func TestAsyncConnectionProcessesVppTimeout(t *testing.T) {
+       ctx := setupTest(t, false)
+       defer ctx.teardownTest()
+
+       ctx.conn.Disconnect()
+       conn, statusChan, err := core.AsyncConnect(ctx.mockVpp, core.DefaultMaxReconnectAttempts, core.DefaultReconnectInterval)
+       ctx.conn = conn
+
+       Expect(err).ShouldNot(HaveOccurred())
+       Expect(conn).ShouldNot(BeNil())
+
+       ev := <-statusChan
+       Expect(ev.State).Should(BeEquivalentTo(core.Connected))
+
+       // make control ping reply fail so that connection.healthCheckLoop()
+       // initiates reconnection.
+       ctx.mockVpp.MockReply(&vpe.ControlPingReply{
+               Retval: -1,
+       })
+       time.Sleep(time.Duration(1+core.HealthCheckThreshold) * (core.HealthCheckInterval + 2*core.HealthCheckReplyTimeout))
+}
+
 func TestCodec(t *testing.T) {
        RegisterTestingT(t)