Skip to content

Commit fa0ca78

Browse files
committed
Auto merge of rust-lang#102655 - joboet:windows_tls_opt, r=ChrisDenton
Optimize TLS on Windows. This implements the suggestion in the current TLS code to embed the linked list of destructors in the `StaticKey` structure to save allocations. Additionally, locking is avoided when no destructor needs to be run. By using one Windows-provided `Once` per key instead of a global lock, locking is more fine-grained (this unblocks rust-lang#100579).
2 parents 3cf5fc5 + d457801 commit fa0ca78

File tree

9 files changed

+202
-117
lines changed

9 files changed

+202
-117
lines changed

library/std/src/sys/sgx/thread_local_key.rs

-5
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,3 @@ pub unsafe fn get(key: Key) -> *mut u8 {
2121
pub unsafe fn destroy(key: Key) {
2222
Tls::destroy(AbiKey::from_usize(key))
2323
}
24-
25-
#[inline]
26-
pub fn requires_synchronized_create() -> bool {
27-
false
28-
}

library/std/src/sys/solid/thread_local_key.rs

-5
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,3 @@ pub unsafe fn get(_key: Key) -> *mut u8 {
1919
pub unsafe fn destroy(_key: Key) {
2020
panic!("should not be used on the solid target");
2121
}
22-
23-
#[inline]
24-
pub fn requires_synchronized_create() -> bool {
25-
panic!("should not be used on the solid target");
26-
}

library/std/src/sys/unix/thread_local_key.rs

-5
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,3 @@ pub unsafe fn destroy(key: Key) {
2727
let r = libc::pthread_key_delete(key);
2828
debug_assert_eq!(r, 0);
2929
}
30-
31-
#[inline]
32-
pub fn requires_synchronized_create() -> bool {
33-
false
34-
}

library/std/src/sys/unsupported/thread_local_key.rs

-5
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,3 @@ pub unsafe fn get(_key: Key) -> *mut u8 {
1919
pub unsafe fn destroy(_key: Key) {
2020
panic!("should not be used on this target");
2121
}
22-
23-
#[inline]
24-
pub fn requires_synchronized_create() -> bool {
25-
panic!("should not be used on this target");
26-
}

library/std/src/sys/windows/c.rs

+17
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ pub type BCRYPT_ALG_HANDLE = LPVOID;
7171
pub type PCONDITION_VARIABLE = *mut CONDITION_VARIABLE;
7272
pub type PLARGE_INTEGER = *mut c_longlong;
7373
pub type PSRWLOCK = *mut SRWLOCK;
74+
pub type LPINIT_ONCE = *mut INIT_ONCE;
7475

7576
pub type SOCKET = crate::os::windows::raw::SOCKET;
7677
pub type socklen_t = c_int;
@@ -194,6 +195,9 @@ pub const DUPLICATE_SAME_ACCESS: DWORD = 0x00000002;
194195

195196
pub const CONDITION_VARIABLE_INIT: CONDITION_VARIABLE = CONDITION_VARIABLE { ptr: ptr::null_mut() };
196197
pub const SRWLOCK_INIT: SRWLOCK = SRWLOCK { ptr: ptr::null_mut() };
198+
pub const INIT_ONCE_STATIC_INIT: INIT_ONCE = INIT_ONCE { ptr: ptr::null_mut() };
199+
200+
pub const INIT_ONCE_INIT_FAILED: DWORD = 0x00000004;
197201

198202
pub const DETACHED_PROCESS: DWORD = 0x00000008;
199203
pub const CREATE_NEW_PROCESS_GROUP: DWORD = 0x00000200;
@@ -565,6 +569,10 @@ pub struct CONDITION_VARIABLE {
565569
pub struct SRWLOCK {
566570
pub ptr: LPVOID,
567571
}
572+
#[repr(C)]
573+
pub struct INIT_ONCE {
574+
pub ptr: LPVOID,
575+
}
568576

569577
#[repr(C)]
570578
pub struct REPARSE_MOUNTPOINT_DATA_BUFFER {
@@ -955,6 +963,7 @@ extern "system" {
955963
pub fn TlsAlloc() -> DWORD;
956964
pub fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
957965
pub fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
966+
pub fn TlsFree(dwTlsIndex: DWORD) -> BOOL;
958967
pub fn GetLastError() -> DWORD;
959968
pub fn QueryPerformanceFrequency(lpFrequency: *mut LARGE_INTEGER) -> BOOL;
960969
pub fn QueryPerformanceCounter(lpPerformanceCount: *mut LARGE_INTEGER) -> BOOL;
@@ -1114,6 +1123,14 @@ extern "system" {
11141123
pub fn TryAcquireSRWLockExclusive(SRWLock: PSRWLOCK) -> BOOLEAN;
11151124
pub fn TryAcquireSRWLockShared(SRWLock: PSRWLOCK) -> BOOLEAN;
11161125

1126+
pub fn InitOnceBeginInitialize(
1127+
lpInitOnce: LPINIT_ONCE,
1128+
dwFlags: DWORD,
1129+
fPending: LPBOOL,
1130+
lpContext: *mut LPVOID,
1131+
) -> BOOL;
1132+
pub fn InitOnceComplete(lpInitOnce: LPINIT_ONCE, dwFlags: DWORD, lpContext: LPVOID) -> BOOL;
1133+
11171134
pub fn CompareStringOrdinal(
11181135
lpString1: LPCWSTR,
11191136
cchCount1: c_int,

library/std/src/sys/windows/thread_local_key.rs

+123-73
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
1-
use crate::mem::ManuallyDrop;
1+
use crate::cell::UnsafeCell;
22
use crate::ptr;
3-
use crate::sync::atomic::AtomicPtr;
4-
use crate::sync::atomic::Ordering::SeqCst;
3+
use crate::sync::atomic::{
4+
AtomicPtr, AtomicU32,
5+
Ordering::{AcqRel, Acquire, Relaxed, Release},
6+
};
57
use crate::sys::c;
68

7-
pub type Key = c::DWORD;
8-
pub type Dtor = unsafe extern "C" fn(*mut u8);
9+
#[cfg(test)]
10+
mod tests;
11+
12+
type Key = c::DWORD;
13+
type Dtor = unsafe extern "C" fn(*mut u8);
914

1015
// Turns out, like pretty much everything, Windows is pretty close to the
1116
// functionality that Unix provides, but slightly different! In the case of
@@ -22,60 +27,109 @@ pub type Dtor = unsafe extern "C" fn(*mut u8);
2227
// To accomplish this feat, we perform a number of tricks, all contained
2328
// within this module:
2429
//
25-
// * All TLS destructors are tracked by *us*, not the windows runtime. This
30+
// * All TLS destructors are tracked by *us*, not the Windows runtime. This
2631
// means that we have a global list of destructors for each TLS key that
2732
// we know about.
2833
// * When a thread exits, we run over the entire list and run dtors for all
2934
// non-null keys. This attempts to match Unix semantics in this regard.
3035
//
31-
// This ends up having the overhead of using a global list, having some
32-
// locks here and there, and in general just adding some more code bloat. We
33-
// attempt to optimize runtime by forgetting keys that don't have
34-
// destructors, but this only gets us so far.
35-
//
3636
// For more details and nitty-gritty, see the code sections below!
3737
//
3838
// [1]: https://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
39-
// [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base
40-
// /threading/thread_local_storage_win.cc#L42
39+
// [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base/threading/thread_local_storage_win.cc#L42
4140

42-
// -------------------------------------------------------------------------
43-
// Native bindings
44-
//
45-
// This section is just raw bindings to the native functions that Windows
46-
// provides, There's a few extra calls to deal with destructors.
41+
pub struct StaticKey {
42+
/// The key value shifted up by one. Since TLS_OUT_OF_INDEXES == DWORD::MAX
43+
/// is not a valid key value, this allows us to use zero as a sentinel value
44+
/// without risking overflow.
45+
key: AtomicU32,
46+
dtor: Option<Dtor>,
47+
next: AtomicPtr<StaticKey>,
48+
/// Currently, destructors cannot be unregistered, so we cannot use racy
49+
/// initialization for keys. Instead, we need to synchronize initialization.
50+
/// Use the Windows-provided `Once` since it does not require TLS.
51+
once: UnsafeCell<c::INIT_ONCE>,
52+
}
4753

48-
#[inline]
49-
pub unsafe fn create(dtor: Option<Dtor>) -> Key {
50-
let key = c::TlsAlloc();
51-
assert!(key != c::TLS_OUT_OF_INDEXES);
52-
if let Some(f) = dtor {
53-
register_dtor(key, f);
54+
impl StaticKey {
55+
#[inline]
56+
pub const fn new(dtor: Option<Dtor>) -> StaticKey {
57+
StaticKey {
58+
key: AtomicU32::new(0),
59+
dtor,
60+
next: AtomicPtr::new(ptr::null_mut()),
61+
once: UnsafeCell::new(c::INIT_ONCE_STATIC_INIT),
62+
}
5463
}
55-
key
56-
}
5764

58-
#[inline]
59-
pub unsafe fn set(key: Key, value: *mut u8) {
60-
let r = c::TlsSetValue(key, value as c::LPVOID);
61-
debug_assert!(r != 0);
62-
}
65+
#[inline]
66+
pub unsafe fn set(&'static self, val: *mut u8) {
67+
let r = c::TlsSetValue(self.key(), val.cast());
68+
debug_assert_eq!(r, c::TRUE);
69+
}
6370

64-
#[inline]
65-
pub unsafe fn get(key: Key) -> *mut u8 {
66-
c::TlsGetValue(key) as *mut u8
67-
}
71+
#[inline]
72+
pub unsafe fn get(&'static self) -> *mut u8 {
73+
c::TlsGetValue(self.key()).cast()
74+
}
6875

69-
#[inline]
70-
pub unsafe fn destroy(_key: Key) {
71-
rtabort!("can't destroy tls keys on windows")
72-
}
76+
#[inline]
77+
unsafe fn key(&'static self) -> Key {
78+
match self.key.load(Acquire) {
79+
0 => self.init(),
80+
key => key - 1,
81+
}
82+
}
83+
84+
#[cold]
85+
unsafe fn init(&'static self) -> Key {
86+
if self.dtor.is_some() {
87+
let mut pending = c::FALSE;
88+
let r = c::InitOnceBeginInitialize(self.once.get(), 0, &mut pending, ptr::null_mut());
89+
assert_eq!(r, c::TRUE);
7390

74-
#[inline]
75-
pub fn requires_synchronized_create() -> bool {
76-
true
91+
if pending == c::FALSE {
92+
// Some other thread initialized the key, load it.
93+
self.key.load(Relaxed) - 1
94+
} else {
95+
let key = c::TlsAlloc();
96+
if key == c::TLS_OUT_OF_INDEXES {
97+
// Wake up the waiting threads before panicking to avoid deadlock.
98+
c::InitOnceComplete(self.once.get(), c::INIT_ONCE_INIT_FAILED, ptr::null_mut());
99+
panic!("out of TLS indexes");
100+
}
101+
102+
self.key.store(key + 1, Release);
103+
register_dtor(self);
104+
105+
let r = c::InitOnceComplete(self.once.get(), 0, ptr::null_mut());
106+
debug_assert_eq!(r, c::TRUE);
107+
108+
key
109+
}
110+
} else {
111+
// If there is no destructor to clean up, we can use racy initialization.
112+
113+
let key = c::TlsAlloc();
114+
assert_ne!(key, c::TLS_OUT_OF_INDEXES, "out of TLS indexes");
115+
116+
match self.key.compare_exchange(0, key + 1, AcqRel, Acquire) {
117+
Ok(_) => key,
118+
Err(new) => {
119+
// Some other thread completed initialization first, so destroy
120+
// our key and use theirs.
121+
let r = c::TlsFree(key);
122+
debug_assert_eq!(r, c::TRUE);
123+
new - 1
124+
}
125+
}
126+
}
127+
}
77128
}
78129

130+
unsafe impl Send for StaticKey {}
131+
unsafe impl Sync for StaticKey {}
132+
79133
// -------------------------------------------------------------------------
80134
// Dtor registration
81135
//
@@ -96,29 +150,21 @@ pub fn requires_synchronized_create() -> bool {
96150
// Typically processes have a statically known set of TLS keys which is pretty
97151
// small, and we'd want to keep this memory alive for the whole process anyway
98152
// really.
99-
//
100-
// Perhaps one day we can fold the `Box` here into a static allocation,
101-
// expanding the `StaticKey` structure to contain not only a slot for the TLS
102-
// key but also a slot for the destructor queue on windows. An optimization for
103-
// another day!
104-
105-
static DTORS: AtomicPtr<Node> = AtomicPtr::new(ptr::null_mut());
106-
107-
struct Node {
108-
dtor: Dtor,
109-
key: Key,
110-
next: *mut Node,
111-
}
112153

113-
unsafe fn register_dtor(key: Key, dtor: Dtor) {
114-
let mut node = ManuallyDrop::new(Box::new(Node { key, dtor, next: ptr::null_mut() }));
154+
static DTORS: AtomicPtr<StaticKey> = AtomicPtr::new(ptr::null_mut());
115155

116-
let mut head = DTORS.load(SeqCst);
156+
/// Should only be called once per key, otherwise loops or breaks may occur in
157+
/// the linked list.
158+
unsafe fn register_dtor(key: &'static StaticKey) {
159+
let this = <*const StaticKey>::cast_mut(key);
160+
// Use acquire ordering to pass along the changes done by the previously
161+
// registered keys when we store the new head with release ordering.
162+
let mut head = DTORS.load(Acquire);
117163
loop {
118-
node.next = head;
119-
match DTORS.compare_exchange(head, &mut **node, SeqCst, SeqCst) {
120-
Ok(_) => return, // nothing to drop, we successfully added the node to the list
121-
Err(cur) => head = cur,
164+
key.next.store(head, Relaxed);
165+
match DTORS.compare_exchange_weak(head, this, Release, Acquire) {
166+
Ok(_) => break,
167+
Err(new) => head = new,
122168
}
123169
}
124170
}
@@ -214,25 +260,29 @@ unsafe extern "system" fn on_tls_callback(h: c::LPVOID, dwReason: c::DWORD, pv:
214260
unsafe fn reference_tls_used() {}
215261
}
216262

217-
#[allow(dead_code)] // actually called above
263+
#[allow(dead_code)] // actually called below
218264
unsafe fn run_dtors() {
219-
let mut any_run = true;
220265
for _ in 0..5 {
221-
if !any_run {
222-
break;
223-
}
224-
any_run = false;
225-
let mut cur = DTORS.load(SeqCst);
266+
let mut any_run = false;
267+
268+
// Use acquire ordering to observe key initialization.
269+
let mut cur = DTORS.load(Acquire);
226270
while !cur.is_null() {
227-
let ptr = c::TlsGetValue((*cur).key);
271+
let key = (*cur).key.load(Relaxed) - 1;
272+
let dtor = (*cur).dtor.unwrap();
228273

274+
let ptr = c::TlsGetValue(key);
229275
if !ptr.is_null() {
230-
c::TlsSetValue((*cur).key, ptr::null_mut());
231-
((*cur).dtor)(ptr as *mut _);
276+
c::TlsSetValue(key, ptr::null_mut());
277+
dtor(ptr as *mut _);
232278
any_run = true;
233279
}
234280

235-
cur = (*cur).next;
281+
cur = (*cur).next.load(Relaxed);
282+
}
283+
284+
if !any_run {
285+
break;
236286
}
237287
}
238288
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
use super::StaticKey;
2+
use crate::ptr;
3+
4+
#[test]
5+
fn smoke() {
6+
static K1: StaticKey = StaticKey::new(None);
7+
static K2: StaticKey = StaticKey::new(None);
8+
9+
unsafe {
10+
assert!(K1.get().is_null());
11+
assert!(K2.get().is_null());
12+
K1.set(ptr::invalid_mut(1));
13+
K2.set(ptr::invalid_mut(2));
14+
assert_eq!(K1.get() as usize, 1);
15+
assert_eq!(K2.get() as usize, 2);
16+
}
17+
}
18+
19+
#[test]
20+
fn destructors() {
21+
use crate::mem::ManuallyDrop;
22+
use crate::sync::Arc;
23+
use crate::thread;
24+
25+
unsafe extern "C" fn destruct(ptr: *mut u8) {
26+
drop(Arc::from_raw(ptr as *const ()));
27+
}
28+
29+
static KEY: StaticKey = StaticKey::new(Some(destruct));
30+
31+
let shared1 = Arc::new(());
32+
let shared2 = Arc::clone(&shared1);
33+
34+
unsafe {
35+
assert!(KEY.get().is_null());
36+
KEY.set(Arc::into_raw(shared1) as *mut u8);
37+
}
38+
39+
thread::spawn(move || unsafe {
40+
assert!(KEY.get().is_null());
41+
KEY.set(Arc::into_raw(shared2) as *mut u8);
42+
})
43+
.join()
44+
.unwrap();
45+
46+
// Leak the Arc, let the TLS destructor clean it up.
47+
let shared1 = unsafe { ManuallyDrop::new(Arc::from_raw(KEY.get() as *const ())) };
48+
assert_eq!(
49+
Arc::strong_count(&shared1),
50+
1,
51+
"destructor should have dropped the other reference on thread exit"
52+
);
53+
}

0 commit comments

Comments
 (0)