-
Notifications
You must be signed in to change notification settings - Fork 4.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enable AVX-512 for block unrollings (both copying and zeroing) #85389
Conversation
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch Issue Details: Closes #83798. This PR enables AVX-512 for various unrollings using GT_BLK. Examples: struct MyStruct {
long a,b,c,d,e,f,g,h;
}
// Copying
MyStruct StructCopy(MyStruct s)
{
return s;
}
// Zeroing
void StackallocZeroing()
{
byte* ptr = stackalloc byte[300];
Consume(ptr);
} old codegen: ; Method Tests:StructCopy(Tests+MyStruct):Tests+MyStruct:this
vzeroupper
vmovdqu ymm0, ymmword ptr [r8]
vmovdqu ymmword ptr [rdx], ymm0
vmovdqu ymm0, ymmword ptr [r8+20H]
vmovdqu ymmword ptr [rdx+20H], ymm0
mov rax, rdx
vzeroupper
ret
; Total bytes of code: 30
; Method Tests:StackallocZeroing():this
push rbp
sub rsp, 48
lea rbp, [rsp+20H]
mov rax, 0xD1FFAB1E
mov qword ptr [rbp], rax
test dword ptr [rsp], esp
sub rsp, 304
lea rcx, [rsp+20H]
mov qword ptr [rbp+08H], rcx
xor edx, edx
mov r8d, 304
call CORINFO_HELP_MEMSET
mov rcx, qword ptr [rbp+08H]
call [Tests:Consume(ulong)]
mov rcx, 0xD1FFAB1E
cmp qword ptr [rbp], rcx
je SHORT G_M16409_IG03
call CORINFO_HELP_FAIL_FAST
nop
lea rsp, [rbp+10H]
pop rbp
ret
; Total bytes of code: 94 new codegen: ; Method Tests:StructCopy(Tests+MyStruct):Tests+MyStruct:this
vzeroupper
vmovdqu32 zmm0, zmmword ptr [r8]
vmovdqu32 zmmword ptr [rdx], zmm0
mov rax, rdx
vzeroupper
ret
; Total bytes of code: 22
; Method Tests:StackallocZeroing():this
push rbp
sub rsp, 48
vzeroupper
lea rbp, [rsp+20H]
mov rax, 0xD1FFAB1E
mov qword ptr [rbp+08H], rax
test dword ptr [rsp], esp
sub rsp, 304
lea rcx, [rsp+20H]
vxorps zmm0, zmm0
vmovdqu32 zmmword ptr [rcx], zmm0
vmovdqu32 zmmword ptr [rcx+40H], zmm0
vmovdqu32 zmmword ptr [rcx+80H], zmm0
vmovdqu32 zmmword ptr [rcx+C0H], zmm0
vmovdqu32 zmmword ptr [rcx+F0H], zmm0
call [Tests:Consume(ulong)]
mov rcx, 0xD1FFAB1E
cmp qword ptr [rbp+08H], rcx
je SHORT G_M16409_IG03
call CORINFO_HELP_FAIL_FAST
nop
lea rsp, [rbp+10H]
pop rbp
ret
; Total bytes of code: 119
|
/azp list |
This comment was marked as resolved.
This comment was marked as resolved.
/azp run runtime-coreclr outerloop, runtime-coreclr jitstress-isas-x86 |
Azure Pipelines successfully started running 2 pipeline(s). |
Diffs |
@tannergooding @BruceForstall @dotnet/avx512-contrib PTAL, I didn't enable it for non-zeroing init (e.g. |
Failures are mostly #85403 |
benchmark: [Benchmark]
public void Test()
{
var ptr = stackalloc long[42];
Consume(ptr);
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void Consume(void* ptr) { }
}
|
Closes #83798
This PR enables AVX-512 for various unrollings using `GT_BLK` - it can be `stackalloc` zeroing, struct copy/initialization, `Unsafe.InitBlock`/`Unsafe.CopyBlock` calls, etc.
Examples:
Codegen diff: https://www.diffchecker.com/cxc6UYLf/ (this PR is on the right)
As a result, it widens the size ranges that get unrolled inline, where we previously fell back to `memcpy`/`memset` calls.
Benchmark:
Ryzen 7950x, avx512, win-x64