runtime: add ERMS-based memmove support for modern CPU platforms

The current memmove implementation uses REP MOVSB to copy data larger than
2KB when the useAVXmemmove global variable is false and the CPU supports
the ERMS feature.

This path is currently enabled only on CPUs of the Sandy Bridge (Client),
Sandy Bridge (Server), Ivy Bridge (Client), and Ivy Bridge (Server)
microarchitectures.

For modern Intel CPU microarchitectures that support the ERMS feature, such
as Ice Lake (Server) and Sapphire Rapids, REP MOVSB achieves better
performance than the AVX-based copy currently implemented in memmove.
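
Numbers like the ones below can be produced with benchmarks of the following
shape (a sketch only; the package name is hypothetical, and the real
measurements come from the runtime package's own Memmove benchmarks):

package memmove_test

import "testing"

// copy on large byte slices compiles down to runtime.memmove, so this
// exercises the code path changed by this CL.
func benchCopy(b *testing.B, n int) {
	src := make([]byte, n)
	dst := make([]byte, n)
	b.SetBytes(int64(n))
	for i := 0; i < b.N; i++ {
		copy(dst, src)
	}
}

func BenchmarkMemmove2048(b *testing.B) { benchCopy(b, 2048) }
func BenchmarkMemmove4096(b *testing.B) { benchCopy(b, 4096) }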

Benchstat result:

goos: linux
goarch: amd64
pkg: runtime
cpu: Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz
               │  ./old.txt  │              ./new.txt              │
               │   sec/op    │   sec/op     vs base                │
Memmove/2048-2   25.24n ± 0%   24.27n ± 0%   -3.84% (p=0.000 n=10)
Memmove/4096-2   44.87n ± 0%   33.16n ± 1%  -26.11% (p=0.000 n=10)
geomean          33.65n        28.37n       -15.71%

               │  ./old.txt   │               ./new.txt               │
               │     B/s      │      B/s       vs base                │
Memmove/2048-2   75.56Gi ± 0%    78.59Gi ± 0%   +4.02% (p=0.000 n=10)
Memmove/4096-2   85.01Gi ± 0%   115.05Gi ± 1%  +35.34% (p=0.000 n=10)
geomean          80.14Gi         95.09Gi       +18.65%

Fixes #66958

Change-Id: I1fafd1b51a16752f83ac15047cf3b29422a79d5d
GitHub-Last-Rev: 89cf5af
GitHub-Pull-Request: #66959
Reviewed-on: https://go-review.googlesource.com/c/go/+/580735
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Auto-Submit: Keith Randall <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
cocotyty authored and gopherbot committed Jul 22, 2024
1 parent 20e18c9 commit 601ea46
Showing 4 changed files with 59 additions and 25 deletions.
1 change: 1 addition & 0 deletions src/internal/cpu/cpu.go
@@ -37,6 +37,7 @@ var X86 struct {
 	HasBMI1      bool
 	HasBMI2      bool
 	HasERMS      bool
+	HasFSRM      bool
 	HasFMA       bool
 	HasOSXSAVE   bool
 	HasPCLMULQDQ bool
8 changes: 6 additions & 2 deletions src/internal/cpu/cpu_x86.go
@@ -40,7 +40,8 @@ const (
 	cpuid_SHA      = 1 << 29
 	cpuid_AVX512BW = 1 << 30
 	cpuid_AVX512VL = 1 << 31
-
+	// edx bits
+	cpuid_FSRM = 1 << 4
 	// edx bits for CPUID 0x80000001
 	cpuid_RDTSCP = 1 << 27
 )
@@ -52,6 +53,7 @@ func doinit() {
 		{Name: "adx", Feature: &X86.HasADX},
 		{Name: "aes", Feature: &X86.HasAES},
 		{Name: "erms", Feature: &X86.HasERMS},
+		{Name: "fsrm", Feature: &X86.HasFSRM},
 		{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
 		{Name: "rdtscp", Feature: &X86.HasRDTSCP},
 		{Name: "sha", Feature: &X86.HasSHA},
@@ -137,7 +139,7 @@ func doinit() {
 		return
 	}
 
-	_, ebx7, _, _ := cpuid(7, 0)
+	_, ebx7, _, edx7 := cpuid(7, 0)
 	X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
 	X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
 	X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
@@ -151,6 +153,8 @@ func doinit() {
 		X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
 	}
 
+	X86.HasFSRM = isSet(edx7, cpuid_FSRM)
+
 	var maxExtendedInformation uint32
 	maxExtendedInformation, _, _, _ = cpuid(0x80000000, 0)
 
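As background, FSRM ("Fast Short REP MOV") is reported in EDX bit 4 of CPUID
leaf 7, sub-leaf 0, which is what the new cpuid_FSRM constant encodes. A
standalone sketch of the same check (the cpuid helper here is hypothetical;
the real one in internal/cpu is implemented in assembly):

package cpusketch

// hasFSRM reports whether CPUID(EAX=7, ECX=0) sets EDX bit 4 (FSRM).
func hasFSRM(cpuid func(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)) bool {
	const cpuidFSRM = 1 << 4
	_, _, _, edx7 := cpuid(7, 0)
	return edx7&cpuidFSRM != 0
}

Registering the feature in doinit's option list also makes it switchable off
for testing like the other entries, via GODEBUG=cpu.fsrm=off.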
34 changes: 24 additions & 10 deletions src/runtime/cpuflags_amd64.go
@@ -8,17 +8,31 @@ import (
 	"internal/cpu"
 )
 
-var useAVXmemmove bool
+var memmoveBits uint8
 
-func init() {
-	// Let's remove stepping and reserved fields
-	processor := processorVersionInfo & 0x0FFF3FF0
+const (
+	// avxSupported indicates that the CPU supports AVX instructions.
+	avxSupported = 1 << 0
 
-	isIntelBridgeFamily := isIntel &&
-		processor == 0x206A0 ||
-		processor == 0x206D0 ||
-		processor == 0x306A0 ||
-		processor == 0x306E0
+	// repmovsPreferred indicates that REP MOVSx instruction is more
+	// efficient on the CPU.
+	repmovsPreferred = 1 << 1
+)
 
-	useAVXmemmove = cpu.X86.HasAVX && !isIntelBridgeFamily
+func init() {
+	// Here we assume that on modern CPUs with both FSRM and ERMS features,
+	// copying data blocks of 2KB or larger using the REP MOVSB instruction
+	// will be more efficient to avoid having to keep up with CPU generations.
+	// Therefore, we may retain a BlockList mechanism to ensure that microarchitectures
+	// that do not fit this case may appear in the future.
+	// We enable it on Intel CPUs first, and we may support more platforms
+	// in the future.
+	isERMSNiceCPU := isIntel
+	useREPMOV := isERMSNiceCPU && cpu.X86.HasERMS && cpu.X86.HasFSRM
+	if cpu.X86.HasAVX {
+		memmoveBits |= avxSupported
+	}
+	if useREPMOV {
+		memmoveBits |= repmovsPreferred
+	}
 }
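
Taken together with the assembly change below, memmoveBits replaces the old
useAVXmemmove boolean with a small capability mask. A rough Go rendering of
the selection the new assembly makes for copies larger than 256 bytes
(illustrative only; in the real code the scalar fallback dispatches further
on size and alignment):

package memmovesketch

const (
	avxSupported     = 1 << 0
	repmovsPreferred = 1 << 1
)

// copyStrategy mirrors the order of the checks in memmove_amd64.s.
func copyStrategy(n int, dst uintptr, bits uint8) string {
	if bits == avxSupported {
		// AVX is supported and REP MOVSx is not preferred: use AVX directly.
		return "avxUnaligned"
	}
	if n >= 2048 && dst&15 == 0 && bits&repmovsPreferred != 0 {
		// Large forward copy to a 16-byte-aligned destination on a CPU
		// with ERMS+FSRM: REP MOVSQ via fwdBy8.
		return "fwdBy8"
	}
	if bits&avxSupported != 0 {
		// check_avx: REP MOVSx was ruled out, fall back to AVX.
		return "avxUnaligned"
	}
	return "scalar paths (move_256through2048 or REP loops)"
}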
41 changes: 28 additions & 13 deletions src/runtime/memmove_amd64.s
@@ -72,26 +72,34 @@ tail:
 	CMPQ	BX, $256
 	JBE	move_129through256
 
-	TESTB	$1, runtime·useAVXmemmove(SB)
-	JNZ	avxUnaligned
-
+	MOVB	runtime·memmoveBits(SB), AX
+	// We have AVX but we don't want to use REP MOVSx.
+	CMPB	AX, $const_avxSupported
+	JEQ	avxUnaligned
 /*
  * check and set for backwards
  */
 	CMPQ	SI, DI
 	JLS	back
 
 /*
- * forward copy loop
- */
+	* forward copy loop
+	*/
 forward:
 	CMPQ	BX, $2048
-	JLS	move_256through2048
-
-	// If REP MOVSB isn't fast, don't use it
-	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
-	JNE	fwdBy8
+	JL	check_avx
+	// REP MOVSx is slow if destination address is unaligned.
+	TESTQ	$15,DI
+	JNZ	check_avx
+	TESTB	$const_repmovsPreferred, AX
+	JNZ	fwdBy8
+	// For backward copy, REP MOVSx performs worse than avx.
+check_avx:
+	TESTB	$const_avxSupported, AX
+	JNZ	avxUnaligned
 
+	CMPQ	BX, $2048
+	JLS	move_256through2048
 	// Check alignment
 	MOVL	SI, AX
 	ORL	DI, AX
@@ -104,12 +112,16 @@ forward:
 	RET
 
 fwdBy8:
+	// Loading the last (possibly partially overlapping) word and writing
+	// it at the end.
+	MOVQ	-8(SI)(BX*1), AX
+	LEAQ	-8(DI)(BX*1), DX
 	// Do 8 bytes at a time
-	MOVQ	BX, CX
+	LEAQ	-1(BX),CX
 	SHRQ	$3, CX
-	ANDQ	$7, BX
 	REP;	MOVSQ
-	JMP	tail
+	MOVQ	AX, (DX)
+	RET
 
 back:
 	/*
@@ -119,6 +131,9 @@ back:
 	ADDQ	BX, CX
 	CMPQ	CX, DI
 	JLS	forward
+
+	TESTB	$const_avxSupported, AX
+	JNZ	avxUnaligned
 /*
  * whole thing backwards has
  * adjusted addresses
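The reworked fwdBy8 also drops the old byte-tail pass (ANDQ $7, BX followed
by JMP tail): it saves the last, possibly overlapping word up front, copies
whole 8-byte words with REP MOVSQ, and then stores the saved word at the end.
A sketch of the same idea in Go (illustrative only; the real code is the
assembly above, and the forward path guarantees n >= 8 here):

package memmovesketch

import "encoding/binary"

// fwdBy8Sketch copies n bytes forward from src to dst, n >= 8.
func fwdBy8Sketch(dst, src []byte, n int) {
	last := binary.LittleEndian.Uint64(src[n-8:]) // MOVQ -8(SI)(BX*1), AX
	words := (n - 1) / 8                          // LEAQ -1(BX),CX; SHRQ $3, CX
	for i := 0; i < 8*words; i += 8 {             // REP; MOVSQ
		binary.LittleEndian.PutUint64(dst[i:], binary.LittleEndian.Uint64(src[i:]))
	}
	binary.LittleEndian.PutUint64(dst[n-8:], last) // MOVQ AX, (DX)
}

Saving the trailing word first is what makes the single REP MOVSQ sufficient:
the final store covers whatever 1-8 byte remainder the word loop left behind.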
