Skip to content

Commit ed7511a

Browse files
committed
rearrange jl_delete_thread to be thread-safe
Prior to this change, especially on macOS, the GC safepoint at the end of jl_delete_thread could cause the process to segfault, because the current_task state had already been freed by that point. Rearrange the code so that all GC interactions (except for the atomic store to current_task) are handled before entering the GC-safe region, and so that signaling that the thread is deleted (by setting current_task = NULL, which is published to other threads by jl_unlock_profile_wr) happens last.

```
ERROR: Exception handler triggered on unmanaged thread.
Process 53827 stopped
* thread #5, stop reason = EXC_BAD_ACCESS (code=2, address=0x100018008)
    frame #0: 0x0000000100b74344 libjulia-internal.1.12.0.dylib`jl_delete_thread [inlined] jl_gc_state_set(ptls=0x000000011f8b3200, state='\x02', old_state=<unavailable>) at julia_threads.h:272:9 [opt]
   269      assert(old_state != JL_GC_CONCURRENT_COLLECTOR_THREAD);
   270      jl_atomic_store_release(&ptls->gc_state, state);
   271      if (state == JL_GC_STATE_UNSAFE || old_state == JL_GC_STATE_UNSAFE)
-> 272          jl_gc_safepoint_(ptls);
   273      return old_state;
   274  }
   275  STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
Target 0: (julia) stopped.
(lldb) up
frame #1: 0x0000000100b74320 libjulia-internal.1.12.0.dylib`jl_delete_thread [inlined] jl_gc_state_save_and_set(ptls=0x000000011f8b3200, state='\x02') at julia_threads.h:278:12 [opt]
   275  STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
   276                                                int8_t state)
   277  {
-> 278      return jl_gc_state_set(ptls, state, jl_atomic_load_relaxed(&ptls->gc_state));
   279  }
   280  #ifdef __clang_gcanalyzer__
   281  // these might not be a safepoint (if they are no-op safe=>safe transitions), but we have to assume it could be (statically)
(lldb)
frame #2: 0x0000000100b7431c libjulia-internal.1.12.0.dylib`jl_delete_thread(value=0x000000011f8b3200) at threading.c:537:11 [opt]
   534      ptls->root_task = NULL;
   535      jl_free_thread_gc_state(ptls);
   536      // then park in safe-region
-> 537      (void)jl_gc_safe_enter(ptls);
   538  }
```
1 parent ce97843 commit ed7511a

File tree

2 files changed

+27
-21
lines changed

2 files changed

+27
-21
lines changed

src/threading.c

+25-21
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,30 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
464464
// prior unsafe-region (before we let it release the stack memory)
465465
(void)jl_gc_unsafe_enter(ptls);
466466
scheduler_delete_thread(ptls);
467+
// need to clear pgcstack and eh, but we can clear everything now too
468+
jl_task_t *ct = jl_atomic_load_relaxed(&ptls->current_task);
469+
jl_task_frame_noreturn(ct);
470+
if (jl_set_task_tid(ptls->root_task, ptls->tid)) {
471+
// the system will probably free this stack memory soon
472+
// so prevent any other thread from accessing it later
473+
if (ct != ptls->root_task)
474+
jl_task_frame_noreturn(ptls->root_task);
475+
}
476+
else {
477+
// Uh oh. The user cleared the sticky bit so it started running
478+
// elsewhere, then called pthread_exit on this thread from another
479+
// Task, which will free the stack memory of that root task soon. This
480+
// is not recoverable. Though we could just hang here, a fatal message
481+
// is likely better.
482+
jl_safe_printf("fatal: thread exited from wrong Task.\n");
483+
abort();
484+
}
485+
ptls->previous_exception = NULL;
486+
// allow the page root_task is on to be freed
487+
ptls->root_task = NULL;
488+
jl_free_thread_gc_state(ptls);
489+
// park in safe-region from here on (this may run GC again)
490+
(void)jl_gc_safe_enter(ptls);
467491
// try to free some state we do not need anymore
468492
#ifndef _OS_WINDOWS_
469493
void *signal_stack = ptls->signal_stack;
@@ -502,21 +526,7 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
502526
#else
503527
pthread_mutex_lock(&in_signal_lock);
504528
#endif
505-
// need to clear pgcstack and eh, but we can clear everything now too
506-
jl_task_frame_noreturn(jl_atomic_load_relaxed(&ptls->current_task));
507-
if (jl_set_task_tid(ptls->root_task, ptls->tid)) {
508-
// the system will probably free this stack memory soon
509-
// so prevent any other thread from accessing it later
510-
jl_task_frame_noreturn(ptls->root_task);
511-
}
512-
else {
513-
// Uh oh. The user cleared the sticky bit so it started running
514-
// elsewhere, then called pthread_exit on this thread. This is not
515-
// recoverable. Though we could just hang here, a fatal message is better.
516-
jl_safe_printf("fatal: thread exited from wrong Task.\n");
517-
abort();
518-
}
519-
jl_atomic_store_relaxed(&ptls->current_task, NULL); // dead
529+
jl_atomic_store_relaxed(&ptls->current_task, NULL); // indicate dead
520530
// finally, release all of the locks we had grabbed
521531
#ifdef _OS_WINDOWS_
522532
jl_unlock_profile_wr();
@@ -529,12 +539,6 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
529539
#endif
530540
free(ptls->bt_data);
531541
small_arraylist_free(&ptls->locks);
532-
ptls->previous_exception = NULL;
533-
// allow the page root_task is on to be freed
534-
ptls->root_task = NULL;
535-
jl_free_thread_gc_state(ptls);
536-
// then park in safe-region
537-
(void)jl_gc_safe_enter(ptls);
538542
}
539543

540544
//// debugging hack: if we are exiting too fast for error message printing on threads,

test/threads.jl

+2
Original file line numberDiff line numberDiff line change
@@ -407,9 +407,11 @@ let e = Base.Event(true),
407407
onces = Vector{Vector{Nothing}}(undef, length(tids))
408408
for i = 1:length(tids)
409409
function cl()
410+
GC.gc(false) # stress test the GC-safepoint mechanics of jl_adopt_thread
410411
local y = once()
411412
onces[i] = y
412413
@test x !== y === once()
414+
GC.gc(false) # stress test the GC-safepoint mechanics of jl_delete_thread
413415
nothing
414416
end
415417
function threadcallclosure(cl::F) where {F} # create sparam so we can reference the type of cl in the ccall type

0 commit comments

Comments
 (0)