@@ -113,8 +113,9 @@ const (
113
113
maxNoRTTPingBeforeFirstPong = 2 * time .Second
114
114
115
115
// For stalling fast producers
116
- stallClientMinDuration = 100 * time .Millisecond
117
- stallClientMaxDuration = time .Second
116
+ stallClientMinDuration = 2 * time .Millisecond
117
+ stallClientMaxDuration = 5 * time .Millisecond
118
+ stallTotalAllowed = 10 * time .Millisecond
118
119
)
119
120
120
121
var readLoopReportThreshold = readLoopReport
@@ -462,6 +463,9 @@ type readCache struct {
462
463
463
464
// Capture the time we started processing our readLoop.
464
465
start time.Time
466
+
467
+ // Total time stalled so far for readLoop processing.
468
+ tst time.Duration
465
469
}
466
470
467
471
// set the flag (would be equivalent to set the boolean to true)
@@ -1414,6 +1418,11 @@ func (c *client) readLoop(pre []byte) {
1414
1418
}
1415
1419
return
1416
1420
}
1421
+ // Clear total stalled time here.
1422
+ if c .in .tst >= stallClientMaxDuration {
1423
+ c .rateLimitFormatWarnf ("Producer was stalled for a total of %v" , c .in .tst .Round (time .Millisecond ))
1424
+ }
1425
+ c .in .tst = 0
1417
1426
}
1418
1427
1419
1428
// If we are a ROUTER/LEAF and have processed an INFO, it is possible that
@@ -1730,7 +1739,7 @@ func (c *client) flushOutbound() bool {
1730
1739
1731
1740
// Check if we have a stalled gate and if so and we are recovering release
1732
1741
// any stalled producers. Only kind==CLIENT will stall.
1733
- if c .out .stc != nil && (n == attempted || c .out .pb < c .out .mp / 2 ) {
1742
+ if c .out .stc != nil && (n == attempted || c .out .pb < c .out .mp / 4 * 3 ) {
1734
1743
close (c .out .stc )
1735
1744
c .out .stc = nil
1736
1745
}
@@ -2292,7 +2301,8 @@ func (c *client) queueOutbound(data []byte) {
2292
2301
// Check here if we should create a stall channel if we are falling behind.
2293
2302
// We do this here since if we wait for consumer's writeLoop it could be
2294
2303
// too late with large number of fan in producers.
2295
- if c .out .pb > c .out .mp / 2 && c .out .stc == nil {
2304
+ // If the outbound connection is > 75% of maximum pending allowed, create a stall gate.
2305
+ if c .out .pb > c .out .mp / 4 * 3 && c .out .stc == nil {
2296
2306
c .out .stc = make (chan struct {})
2297
2307
}
2298
2308
}
@@ -3337,31 +3347,36 @@ func (c *client) msgHeader(subj, reply []byte, sub *subscription) []byte {
3337
3347
}
3338
3348
3339
3349
func (c * client ) stalledWait (producer * client ) {
3350
+ // Check to see if we have exceeded our total wait time per readLoop invocation.
3351
+ if producer .in .tst > stallTotalAllowed {
3352
+ return
3353
+ }
3354
+
3355
+ // Grab stall channel which the slow consumer will close when caught up.
3340
3356
stall := c .out .stc
3341
- ttl := stallDuration ( c . out . pb , c . out . mp )
3357
+
3342
3358
c .mu .Unlock ()
3343
3359
defer c .mu .Lock ()
3344
3360
3361
+ // Calculate stall time.
3362
+ ttl := stallClientMinDuration
3363
+ if c .out .pb >= c .out .mp {
3364
+ ttl = stallClientMaxDuration
3365
+ }
3366
+ // Now check if we are close to total allowed.
3367
+ if producer .in .tst + ttl > stallTotalAllowed {
3368
+ ttl = stallTotalAllowed - producer .in .tst
3369
+ }
3345
3370
delay := time .NewTimer (ttl )
3346
3371
defer delay .Stop ()
3347
3372
3373
+ start := time .Now ()
3348
3374
select {
3349
3375
case <- stall :
3350
3376
case <- delay .C :
3351
3377
producer .Debugf ("Timed out of fast producer stall (%v)" , ttl )
3352
3378
}
3353
- }
3354
-
3355
- func stallDuration (pb , mp int64 ) time.Duration {
3356
- ttl := stallClientMinDuration
3357
- if pb >= mp {
3358
- ttl = stallClientMaxDuration
3359
- } else if hmp := mp / 2 ; pb > hmp {
3360
- bsz := hmp / 10
3361
- additional := int64 (ttl ) * ((pb - hmp ) / bsz )
3362
- ttl += time .Duration (additional )
3363
- }
3364
- return ttl
3379
+ producer .in .tst += time .Since (start )
3365
3380
}
3366
3381
3367
3382
// Used to treat maps as efficient set
0 commit comments