Skip to content

Commit b5337ce

Browse files
authored
No more sending on closed channels during checkscontroller shutdown (#245)
* feat: added metrics to targetmanager First metric announces registration state. * test: added metrics to the tests * fix: race condition when shutting down checks When the sparrow check controller would shut down, it would iterate over the actual slice of the checks, shut down each check and then proceed to delete said check from the slice. Since the shutting down procedure is instant, there was a race condition that would delete a wrong check from the slice and then the same shutting down loop would try to shut down the same check again. Just returning a copy of the slice resolves this problem, as the iteration is now done on the copy only. A more sophisticated deletion routine for the checks slice could be another way to handle this, but it would probably increase the complexity of the checks and checkscontroller structs. * chore: shutdown message on end * test: added validating test case Signed-off-by: Bruno Bressi <[email protected]> * chore: marked function as helper * test: added test for the controller's update function This test proves that the shallow clone works as intended and returns a clone of the slice, where the original references can still be used and updated. * chore: bumped golangci lint to latest version This should fix the bodyclose linting remarks Signed-off-by: Bruno Bressi <[email protected]> --------- Signed-off-by: Bruno Bressi <[email protected]>
1 parent 9019962 commit b5337ce

File tree

8 files changed

+162
-9
lines changed

8 files changed

+162
-9
lines changed

.github/workflows/pre-commit.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717

1818
- name: Install go toolchain for pre-commit
1919
run: |
20-
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.60.3
20+
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.63.4
2121
go install mvdan.cc/gofumpt@latest
2222
go install github.com/matryer/moq@latest
2323
go install github.com/norwoodj/helm-docs/cmd/helm-docs@latest

cmd/run.go

+4-3
Original file line numberDiff line numberDiff line change
@@ -86,17 +86,18 @@ func run() func(cmd *cobra.Command, args []string) error {
8686

8787
s := sparrow.New(cfg)
8888
cErr := make(chan error, 1)
89-
log.Info("Running sparrow")
89+
log.InfoContext(ctx, "Running sparrow")
9090
go func() {
9191
cErr <- s.Run(ctx)
9292
}()
9393

9494
select {
9595
case <-sigChan:
96-
log.Info("Signal received, shutting down")
96+
log.InfoContext(ctx, "Signal received, shutting down")
9797
cancel()
9898
<-cErr
99-
case err := <-cErr:
99+
case err = <-cErr:
100+
log.InfoContext(ctx, "Sparrow was shut down")
100101
return err
101102
}
102103

pkg/api/api_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ func TestAPI_Run(t *testing.T) {
7777
rec := httptest.NewRecorder()
7878
a.router.ServeHTTP(rec, req)
7979

80-
if status := rec.Result().StatusCode; status != tt.want.status { //nolint:bodyclose // closed in defer below
80+
if status := rec.Result().StatusCode; status != tt.want.status {
8181
t.Errorf("Handler for route %s returned wrong status code: got %v want %v", tt.want.path, status, tt.want.status)
8282
}
8383

pkg/checks/runtime/checks.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package runtime
2020

2121
import (
22+
"slices"
2223
"sync"
2324

2425
"github.com/telekom/sparrow/pkg/checks"
@@ -53,5 +54,5 @@ func (c *Checks) Delete(check checks.Check) {
5354
func (c *Checks) Iter() []checks.Check {
5455
c.mu.RLock()
5556
defer c.mu.RUnlock()
56-
return c.checks
57+
return slices.Clone(c.checks)
5758
}

pkg/sparrow/controller_test.go

+147
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,58 @@ func TestRun_ContextCancellation(t *testing.T) {
103103
}
104104
}
105105

106+
// TestChecksController_Shutdown tests the shutdown of the ChecksController
107+
// when none, one or multiple checks are registered. The test checks that after shutdown no
108+
// checks are registered anymore (the checks slice is empty) and that the done channel is closed.
109+
func TestChecksController_Shutdown(t *testing.T) {
110+
tests := []struct {
111+
name string
112+
checks []checks.Check
113+
}{
114+
{
115+
name: "no checks registered",
116+
checks: nil,
117+
},
118+
{
119+
name: "one check registered",
120+
checks: []checks.Check{newMockCheck(t, "mockCheck")},
121+
},
122+
{
123+
name: "multiple checks registered",
124+
checks: []checks.Check{
125+
newMockCheck(t, "mockCheck1"),
126+
newMockCheck(t, "mockCheck2"),
127+
newMockCheck(t, "mockCheck3"),
128+
newMockCheck(t, "mockCheck4"),
129+
},
130+
},
131+
}
132+
133+
for _, tt := range tests {
134+
t.Run(tt.name, func(t *testing.T) {
135+
cc := NewChecksController(db.NewInMemory(), metrics.New(metrics.Config{}))
136+
137+
if tt.checks != nil {
138+
for _, check := range tt.checks {
139+
cc.RegisterCheck(context.Background(), check)
140+
}
141+
}
142+
143+
cc.Shutdown(context.Background())
144+
145+
select {
146+
case <-cc.done:
147+
if len(cc.checks.Iter()) != 0 {
148+
t.Errorf("Expected no checks to be registered")
149+
}
150+
return
151+
case <-time.After(time.Second):
152+
t.Fatal("Expected done channel to be closed")
153+
}
154+
})
155+
}
156+
}
157+
106158
func TestChecksController_Reconcile(t *testing.T) {
107159
ctx, cancel := logger.NewContextWithLogger(context.Background())
108160
defer cancel()
@@ -235,6 +287,74 @@ func TestChecksController_Reconcile(t *testing.T) {
235287
}
236288
}
237289

290+
// TestChecksController_Reconcile_Update tests the update of the checks
291+
// when the runtime configuration changes.
292+
func TestChecksController_Reconcile_Update(t *testing.T) {
293+
ctx, cancel := logger.NewContextWithLogger(context.Background())
294+
defer cancel()
295+
296+
tests := []struct {
297+
name string
298+
checks []checks.Check
299+
newRuntimeConfig runtime.Config
300+
}{
301+
{
302+
name: "update health check",
303+
checks: []checks.Check{
304+
health.NewCheck(),
305+
},
306+
newRuntimeConfig: runtime.Config{
307+
Health: &health.Config{
308+
Targets: []string{"https://new.com"},
309+
Interval: 200 * time.Millisecond,
310+
Timeout: 1000 * time.Millisecond,
311+
},
312+
},
313+
},
314+
{
315+
name: "update health & latency check",
316+
checks: []checks.Check{
317+
health.NewCheck(),
318+
latency.NewCheck(),
319+
},
320+
newRuntimeConfig: runtime.Config{
321+
Health: &health.Config{
322+
Targets: []string{"https://new.com"},
323+
Interval: 200 * time.Millisecond,
324+
Timeout: 1000 * time.Millisecond,
325+
},
326+
Latency: &latency.Config{
327+
Targets: []string{"https://new.com"},
328+
Interval: 200 * time.Millisecond,
329+
Timeout: 1000 * time.Millisecond,
330+
},
331+
},
332+
},
333+
}
334+
335+
for _, tt := range tests {
336+
t.Run(tt.name, func(t *testing.T) {
337+
cc := NewChecksController(db.NewInMemory(), metrics.New(metrics.Config{}))
338+
for _, c := range tt.checks {
339+
cc.checks.Add(c)
340+
}
341+
342+
cc.Reconcile(ctx, tt.newRuntimeConfig)
343+
344+
for _, c := range cc.checks.Iter() {
345+
switch c.GetConfig().For() {
346+
case health.CheckName:
347+
hc := c.(*health.Health)
348+
assert.Equal(t, tt.newRuntimeConfig.Health.Targets, hc.GetConfig().(*health.Config).Targets)
349+
case latency.CheckName:
350+
lc := c.(*latency.Latency)
351+
assert.Equal(t, tt.newRuntimeConfig.Latency.Targets, lc.GetConfig().(*latency.Config).Targets)
352+
}
353+
}
354+
})
355+
}
356+
}
357+
238358
func TestChecksController_RegisterCheck(t *testing.T) {
239359
tests := []struct {
240360
name string
@@ -379,3 +499,30 @@ func TestGenerateCheckSpecs(t *testing.T) {
379499
})
380500
}
381501
}
502+
503+
// newMockCheck creates a new mock check with the given name.
504+
func newMockCheck(t *testing.T, name string) *checks.CheckMock {
505+
t.Helper()
506+
return &checks.CheckMock{
507+
GetMetricCollectorsFunc: func() []prometheus.Collector {
508+
return []prometheus.Collector{
509+
prometheus.NewCounter(prometheus.CounterOpts{
510+
Name: fmt.Sprintf("%s_mock_metric", name),
511+
}),
512+
}
513+
},
514+
NameFunc: func() string {
515+
return name
516+
},
517+
RemoveLabelledMetricsFunc: nil,
518+
RunFunc: func(ctx context.Context, cResult chan checks.ResultDTO) error {
519+
t.Logf("Run called for check %s", name)
520+
return nil
521+
},
522+
SchemaFunc: nil,
523+
ShutdownFunc: func() {
524+
t.Logf("Shutdown called for check %s", name)
525+
},
526+
UpdateConfigFunc: nil,
527+
}
528+
}

pkg/sparrow/handlers_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ func TestSparrow_handleCheckMetrics(t *testing.T) {
129129
}
130130

131131
s.handleCheckMetrics(w, r)
132-
resp := w.Result() //nolint:bodyclose
132+
resp := w.Result()
133133
body, _ := io.ReadAll(resp.Body)
134134

135135
if tt.wantCode == http.StatusOK {

pkg/sparrow/run.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ func (s *Sparrow) Run(ctx context.Context) error {
131131
s.shutdown(ctx)
132132
}
133133
case <-s.cDone:
134+
log.InfoContext(ctx, "Sparrow was shut down")
134135
return fmt.Errorf("sparrow was shut down")
135136
}
136137
}
@@ -181,7 +182,7 @@ func (s *Sparrow) shutdown(ctx context.Context) {
181182
defer cancel()
182183

183184
s.shutOnce.Do(func() {
184-
log.Info("Shutting down sparrow gracefully")
185+
log.InfoContext(ctx, "Shutting down sparrow")
185186
var sErrs ErrShutdown
186187
if s.tarMan != nil {
187188
sErrs.errTarMan = s.tarMan.Shutdown(ctx)
@@ -192,7 +193,7 @@ func (s *Sparrow) shutdown(ctx context.Context) {
192193
s.controller.Shutdown(ctx)
193194

194195
if sErrs.HasError() {
195-
log.Error("Failed to shutdown gracefully", "contextError", errC, "errors", sErrs)
196+
log.ErrorContext(ctx, "Failed to shutdown gracefully", "contextError", errC, "errors", sErrs)
196197
}
197198

198199
// Signal that shutdown is complete

pkg/sparrow/run_errors.go

+3
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,15 @@
1818

1919
package sparrow
2020

21+
// ErrShutdown holds any errors that may
22+
// have occurred during shutdown of the Sparrow
2123
type ErrShutdown struct {
2224
errAPI error
2325
errTarMan error
2426
errMetrics error
2527
}
2628

29+
// HasError returns true if any of the errors are set
2730
func (e ErrShutdown) HasError() bool {
2831
return e.errAPI != nil || e.errTarMan != nil || e.errMetrics != nil
2932
}

0 commit comments

Comments
 (0)