Skip to content

Commit 9e39755

Browse files
committed
rearrange jl_delete_thread to be thread-safe (#56097)
Prior to this, especially on macOS, the gc-safepoint here would cause the process to segfault, as we had already freed the current_task state. Rearrange this code so that the GC interactions (except for the atomic store to current_task) are all handled before entering the GC-safe region, and so that signaling that the thread is deleted (by setting current_task = NULL, which is published to other threads by jl_unlock_profile_wr) happens last.

```
ERROR: Exception handler triggered on unmanaged thread.
Process 53827 stopped
* thread #5, stop reason = EXC_BAD_ACCESS (code=2, address=0x100018008)
    frame #0: 0x0000000100b74344 libjulia-internal.1.12.0.dylib`jl_delete_thread [inlined] jl_gc_state_set(ptls=0x000000011f8b3200, state='\x02', old_state=<unavailable>) at julia_threads.h:272:9 [opt]
   269      assert(old_state != JL_GC_CONCURRENT_COLLECTOR_THREAD);
   270      jl_atomic_store_release(&ptls->gc_state, state);
   271      if (state == JL_GC_STATE_UNSAFE || old_state == JL_GC_STATE_UNSAFE)
-> 272          jl_gc_safepoint_(ptls);
   273      return old_state;
   274  }
   275  STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
Target 0: (julia) stopped.
(lldb) up
frame #1: 0x0000000100b74320 libjulia-internal.1.12.0.dylib`jl_delete_thread [inlined] jl_gc_state_save_and_set(ptls=0x000000011f8b3200, state='\x02') at julia_threads.h:278:12 [opt]
   275  STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
   276                                                int8_t state)
   277  {
-> 278      return jl_gc_state_set(ptls, state, jl_atomic_load_relaxed(&ptls->gc_state));
   279  }
   280  #ifdef __clang_gcanalyzer__
   281  // these might not be a safepoint (if they are no-op safe=>safe transitions), but we have to assume it could be (statically)
(lldb)
frame #2: 0x0000000100b7431c libjulia-internal.1.12.0.dylib`jl_delete_thread(value=0x000000011f8b3200) at threading.c:537:11 [opt]
   534      ptls->root_task = NULL;
   535      jl_free_thread_gc_state(ptls);
   536      // then park in safe-region
-> 537      (void)jl_gc_safe_enter(ptls);
   538  }
```

(test incorporated into #55793)

(cherry picked from commit 0d09f3d, resolving conflicts from not having backported #52198)
1 parent 6e28217 commit 9e39755

File tree

3 files changed

+28
-21
lines changed

3 files changed

+28
-21
lines changed

src/gc-stacks.c

+3-3
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT
4343
}
4444

4545

46-
static void free_stack(void *stkbuf, size_t bufsz)
46+
static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT
4747
{
4848
VirtualFree(stkbuf, 0, MEM_RELEASE);
4949
jl_atomic_fetch_add(&num_stack_mappings, -1);
@@ -68,7 +68,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT
6868
return stk;
6969
}
7070

71-
static void free_stack(void *stkbuf, size_t bufsz)
71+
static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT
7272
{
7373
munmap(stkbuf, bufsz);
7474
jl_atomic_fetch_add(&num_stack_mappings, -1);
@@ -110,7 +110,7 @@ static unsigned select_pool(size_t nb) JL_NOTSAFEPOINT
110110
}
111111

112112

113-
static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz)
113+
static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT
114114
{
115115
#ifdef _COMPILER_ASAN_ENABLED_
116116
__asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz);

src/julia.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -1043,7 +1043,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void);
10431043
JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void);
10441044
JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz);
10451045
JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, struct _jl_task_t *owner) JL_NOTSAFEPOINT;
1046-
JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz);
1046+
JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT;
10471047
JL_DLLEXPORT void jl_gc_use(jl_value_t *a);
10481048
// Set GC memory trigger in bytes for greedy memory collecting
10491049
JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem);

src/threading.c

+24-17
Original file line number | Diff line number | Diff line change
@@ -443,6 +443,29 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
443443
// prior unsafe-region (before we let it release the stack memory)
444444
(void)jl_gc_unsafe_enter(ptls);
445445
scheduler_delete_thread(ptls);
446+
// need to clear pgcstack and eh, but we can clear everything now too
447+
jl_task_t *ct = jl_atomic_load_relaxed(&ptls->current_task);
448+
jl_task_frame_noreturn(ct);
449+
if (jl_set_task_tid(ptls->root_task, ptls->tid)) {
450+
// the system will probably free this stack memory soon
451+
// so prevent any other thread from accessing it later
452+
if (ct != ptls->root_task)
453+
jl_task_frame_noreturn(ptls->root_task);
454+
}
455+
else {
456+
// Uh oh. The user cleared the sticky bit so it started running
457+
// elsewhere, then called pthread_exit on this thread from another
458+
// Task, which will free the stack memory of that root task soon. This
459+
// is not recoverable. Though we could just hang here, a fatal message
460+
// is likely better.
461+
jl_safe_printf("fatal: thread exited from wrong Task.\n");
462+
abort();
463+
}
464+
ptls->previous_exception = NULL;
465+
// allow the page root_task is on to be freed
466+
ptls->root_task = NULL;
467+
// park in safe-region from here on (this may run GC again)
468+
(void)jl_gc_safe_enter(ptls);
446469
// try to free some state we do not need anymore
447470
#ifndef _OS_WINDOWS_
448471
void *signal_stack = ptls->signal_stack;
@@ -481,21 +504,7 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
481504
#else
482505
pthread_mutex_lock(&in_signal_lock);
483506
#endif
484-
// need to clear pgcstack and eh, but we can clear everything now too
485-
jl_task_frame_noreturn(jl_atomic_load_relaxed(&ptls->current_task));
486-
if (jl_set_task_tid(ptls->root_task, ptls->tid)) {
487-
// the system will probably free this stack memory soon
488-
// so prevent any other thread from accessing it later
489-
jl_task_frame_noreturn(ptls->root_task);
490-
}
491-
else {
492-
// Uh oh. The user cleared the sticky bit so it started running
493-
// elsewhere, then called pthread_exit on this thread. This is not
494-
// recoverable. Though we could just hang here, a fatal message is better.
495-
jl_safe_printf("fatal: thread exited from wrong Task.\n");
496-
abort();
497-
}
498-
jl_atomic_store_relaxed(&ptls->current_task, NULL); // dead
507+
jl_atomic_store_relaxed(&ptls->current_task, NULL); // indicate dead
499508
// finally, release all of the locks we had grabbed
500509
#ifdef _OS_WINDOWS_
501510
jl_unlock_profile_wr();
@@ -506,8 +515,6 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
506515
#else
507516
pthread_mutex_unlock(&in_signal_lock);
508517
#endif
509-
// then park in safe-region
510-
(void)jl_gc_safe_enter(ptls);
511518
}
512519

513520
//// debugging hack: if we are exiting too fast for error message printing on threads,

0 commit comments

Comments
 (0)