Skip to content

Commit 00a9a9d

Browse files
authored
Fix deadlock on engine start (ethereum#1685)
* Fix deadlock during StartValidating StartValidating makes a call to RefreshValPeers while holding coreMu and RefreshValPeers waits for all validator peers to be deleted and then reconnects to known validators. If any of those peers has called IsValidating before RefreshValPeers tries to delete them, the system gets stuck in a deadlock because IsValidating also tries to acquire coreMu. The peer will never acquire coreMu because it is held by StartValidating, and StartValidating will never return because it is waiting for all peers to disconnect. This commit makes coreStarted into an atomic variable so that peers can make threadsafe calls to IsValidating without needing to acquire coreStarted. * Fix long wait for nodes to connect At test startup sometimes nodes were taking in the region of 30s to connect whilst other times it was happening in μs. The problem was we were trying to connect all peers to all other peers. That meant that for any two peers they would both dial each other. Sometimes if this occurred close enough in time both sides would hang up the connections (I call this cross dialing). This happens because each side counts their outgoing connection as connected and then when the incoming connection arrives they drop it because they see themselves as already connected. When this happened nodes would retry after some time probably 30s and then be connected. The fix was to ensure that for any two nodes only one of them dials the other.
1 parent 5aa1d94 commit 00a9a9d

File tree

5 files changed

+29
-19
lines changed

5 files changed

+29
-19
lines changed

consensus/istanbul/backend/api.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ func (api *API) GetCurrentRoundState() (*core.RoundStateSummary, error) {
193193
api.istanbul.coreMu.RLock()
194194
defer api.istanbul.coreMu.RUnlock()
195195

196-
if !api.istanbul.coreStarted {
196+
if !api.istanbul.isCoreStarted() {
197197
return nil, istanbul.ErrStoppedEngine
198198
}
199199
return api.istanbul.core.CurrentRoundState().Summary(), nil
@@ -203,7 +203,7 @@ func (api *API) ForceRoundChange() (bool, error) {
203203
api.istanbul.coreMu.RLock()
204204
defer api.istanbul.coreMu.RUnlock()
205205

206-
if !api.istanbul.coreStarted {
206+
if !api.istanbul.isCoreStarted() {
207207
return false, istanbul.ErrStoppedEngine
208208
}
209209
api.istanbul.core.ForceRoundChange()

consensus/istanbul/backend/backend.go

+18-5
Original file line numberDiff line numberDiff line change
@@ -113,13 +113,15 @@ func New(config *istanbul.Config, db ethdb.Database) consensus.Istanbul {
113113
logger.Crit("Failed to create recent snapshots cache", "err", err)
114114
}
115115

116+
coreStarted := atomic.Value{}
117+
coreStarted.Store(false)
116118
backend := &Backend{
117119
config: config,
118120
istanbulEventMux: new(event.TypeMux),
119121
logger: logger,
120122
db: db,
121123
recentSnapshots: recentSnapshots,
122-
coreStarted: false,
124+
coreStarted: coreStarted,
123125
announceRunning: false,
124126
gossipCache: NewLRUGossipCache(inmemoryPeers, inmemoryMessages),
125127
announceThreadWg: new(sync.WaitGroup),
@@ -221,7 +223,16 @@ type Backend struct {
221223
validateState func(block *types.Block, statedb *state.StateDB, receipts types.Receipts, usedGas uint64) error
222224
onNewConsensusBlock func(block *types.Block, receipts []*types.Receipt, logs []*types.Log, state *state.StateDB)
223225

224-
coreStarted bool
226+
// We need this to be an atomic value so that we can access it in a lock
227+
// free way from IsValidating. This is required because StartValidating
228+
// makes a call to RefreshValPeers while holding coreMu and RefreshValPeers
229+
// waits for all validator peers to be deleted and then reconnects to known
230+
// validators. If any of those peers has called IsValidating before
231+
// RefreshValPeers tries to delete them the system gets stuck in a
232+
// deadlock, the peer will never acquire coreMu because it is held by
233+
// StartValidating, and StartValidating will never return because it is
234+
// waiting for all peers to disconnect.
235+
coreStarted atomic.Value
225236
coreMu sync.RWMutex
226237

227238
// Snapshots for recent blocks to speed up reorgs
@@ -325,6 +336,10 @@ type Backend struct {
325336
abortCommitHook func(result *istanbulCore.StateProcessResult) bool // Method to call upon committing a proposal
326337
}
327338

339+
func (sb *Backend) isCoreStarted() bool {
340+
return sb.coreStarted.Load().(bool)
341+
}
342+
328343
// IsProxy returns true if instance has proxy flag
329344
func (sb *Backend) IsProxy() bool {
330345
return sb.config.Proxy
@@ -350,9 +365,7 @@ func (sb *Backend) GetProxiedValidatorEngine() proxy.ProxiedValidatorEngine {
350365
// IsValidating return true if instance is validating
351366
func (sb *Backend) IsValidating() bool {
352367
// TODO: Maybe a little laggy, but primary / replica should track the core
353-
sb.coreMu.RLock()
354-
defer sb.coreMu.RUnlock()
355-
return sb.coreStarted
368+
return sb.isCoreStarted()
356369
}
357370

358371
// IsValidator return if instance is a validator (either proxied or standalone)

consensus/istanbul/backend/engine.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,7 @@ func (sb *Backend) updateReplicaStateLoop(bc *ethCore.BlockChain) {
630630
select {
631631
case chainEvent := <-chainEventCh:
632632
sb.coreMu.RLock()
633-
if !sb.coreStarted && sb.replicaState != nil {
633+
if !sb.isCoreStarted() && sb.replicaState != nil {
634634
consensusBlock := new(big.Int).Add(chainEvent.Block.Number(), common.Big1)
635635
sb.replicaState.NewChainHead(consensusBlock)
636636
}
@@ -649,7 +649,7 @@ func (sb *Backend) SetCallBacks(hasBadBlock func(common.Hash) bool,
649649
onNewConsensusBlock func(block *types.Block, receipts []*types.Receipt, logs []*types.Log, state *state.StateDB)) error {
650650
sb.coreMu.RLock()
651651
defer sb.coreMu.RUnlock()
652-
if sb.coreStarted {
652+
if sb.isCoreStarted() {
653653
return istanbul.ErrStartedEngine
654654
}
655655

@@ -664,7 +664,7 @@ func (sb *Backend) SetCallBacks(hasBadBlock func(common.Hash) bool,
664664
func (sb *Backend) StartValidating() error {
665665
sb.coreMu.Lock()
666666
defer sb.coreMu.Unlock()
667-
if sb.coreStarted {
667+
if sb.isCoreStarted() {
668668
return istanbul.ErrStartedEngine
669669
}
670670

@@ -684,7 +684,7 @@ func (sb *Backend) StartValidating() error {
684684
sb.UpdateAnnounceVersion()
685685
}
686686

687-
sb.coreStarted = true
687+
sb.coreStarted.Store(true)
688688

689689
// coreStarted must be true by this point for validator peers to be successfully added
690690
if !sb.config.Proxied {
@@ -700,14 +700,14 @@ func (sb *Backend) StartValidating() error {
700700
func (sb *Backend) StopValidating() error {
701701
sb.coreMu.Lock()
702702
defer sb.coreMu.Unlock()
703-
if !sb.coreStarted {
703+
if !sb.isCoreStarted() {
704704
return istanbul.ErrStoppedEngine
705705
}
706706
sb.logger.Info("Stopping istanbul.Engine validating")
707707
if err := sb.core.Stop(); err != nil {
708708
return err
709709
}
710-
sb.coreStarted = false
710+
sb.coreStarted.Store(false)
711711

712712
return nil
713713
}

consensus/istanbul/backend/handler.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ func (sb *Backend) NewWork() error {
203203

204204
sb.coreMu.RLock()
205205
defer sb.coreMu.RUnlock()
206-
if !sb.coreStarted {
206+
if !sb.isCoreStarted() {
207207
return istanbul.ErrStoppedEngine
208208
}
209209

test/node.go

+2-5
Original file line numberDiff line numberDiff line change
@@ -396,12 +396,9 @@ func NewNetwork(accounts *env.AccountsConfig, gc *genesis.Config, ec *eth.Config
396396
// each other nodes don't start sending consensus messages to another node
397397
// until they have received an enode certificate from that node.
398398
for i, en := range enodes {
399-
for j, n := range network {
400-
if j == i {
401-
continue
402-
}
399+
// Connect to the remaining nodes
400+
for _, n := range network[i+1:] {
403401
n.Server().AddPeer(en, p2p.ValidatorPurpose)
404-
n.Server().AddTrustedPeer(en, p2p.ValidatorPurpose)
405402
}
406403
}
407404

0 commit comments

Comments
 (0)