Skip to content

Commit 54a4c27

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place (see llvm/llvm-project#42332). Disable loop unrolling wherever it goes too nuts, enable vectorization where it doesn't do so automatically, etc. Signed-off-by: Steven Noonan <[email protected]>
1 parent 56deb7c commit 54a4c27

File tree

1 file changed

+51
-32
lines changed

1 file changed

+51
-32
lines changed

miniaudio.h

+51-32
Original file line numberDiff line numberDiff line change
@@ -42797,7 +42797,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4279742797
}
4279842798
}
4279942799

42800-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42800+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4280142801
{
4280242802
ma_uint64 iSample;
4280342803

@@ -43092,10 +43092,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
4309243092
sampleCount = frameCount * channels;
4309343093

4309443094
if (volume == 1) {
43095+
#pragma clang loop vectorize(enable)
4309543096
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4309643097
pDst[iSample] += pSrc[iSample];
4309743098
}
4309843099
} else {
43100+
#pragma clang loop vectorize(enable)
4309943101
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4310043102
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
4310143103
}
@@ -45398,7 +45400,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4539845400
const float a1 = pBQ->a1.f32;
4539945401
const float a2 = pBQ->a2.f32;
4540045402

45401-
MA_ASSUME(channels > 0);
45403+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45404+
#pragma clang loop vectorize(assume_safety)
4540245405
for (c = 0; c < channels; c += 1) {
4540345406
float r1 = pBQ->pR1[c].f32;
4540445407
float r2 = pBQ->pR2[c].f32;
@@ -45430,7 +45433,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4543045433
const ma_int32 a1 = pBQ->a1.s32;
4543145434
const ma_int32 a2 = pBQ->a2.s32;
4543245435

45433-
MA_ASSUME(channels > 0);
45436+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45437+
#pragma clang loop vectorize(assume_safety)
4543445438
for (c = 0; c < channels; c += 1) {
4543545439
ma_int32 r1 = pBQ->pR1[c].s32;
4543645440
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45704,22 +45708,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4570445708
return MA_SUCCESS;
4570545709
}
4570645710

45707-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45711+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4570845712
{
4570945713
ma_uint32 c;
4571045714
const ma_uint32 channels = pLPF->channels;
4571145715
const float a = pLPF->a.f32;
4571245716
const float b = 1 - a;
4571345717

45714-
MA_ASSUME(channels > 0);
45718+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45719+
#pragma clang loop vectorize(assume_safety)
4571545720
for (c = 0; c < channels; c += 1) {
4571645721
float r1 = pLPF->pR1[c].f32;
45717-
float x = pX[c];
45722+
float x = pX[c];
4571845723
float y;
4571945724

45720-
y = b*x + a*r1;
45725+
y = b * x + a * r1;
4572145726

45722-
pY[c] = y;
45727+
pY[c] = y;
4572345728
pLPF->pR1[c].f32 = y;
4572445729
}
4572545730
}
@@ -45731,7 +45736,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4573145736
const ma_int32 a = pLPF->a.s32;
4573245737
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4573345738

45734-
MA_ASSUME(channels > 0);
45739+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45740+
#pragma clang loop vectorize(assume_safety)
4573545741
for (c = 0; c < channels; c += 1) {
4573645742
ma_int32 r1 = pLPF->pR1[c].s32;
4573745743
ma_int32 x = pX[c];
@@ -46584,7 +46590,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4658446590
const float a = 1 - pHPF->a.f32;
4658546591
const float b = 1 - a;
4658646592

46587-
MA_ASSUME(channels > 0);
46593+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4658846594
for (c = 0; c < channels; c += 1) {
4658946595
float r1 = pHPF->pR1[c].f32;
4659046596
float x = pX[c];
@@ -46604,7 +46610,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4660446610
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4660546611
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4660646612

46607-
MA_ASSUME(channels > 0);
46613+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4660846614
for (c = 0; c < channels; c += 1) {
4660946615
ma_int32 r1 = pHPF->pR1[c].s32;
4661046616
ma_int32 x = pX[c];
@@ -48712,6 +48718,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4871248718
ma_uint64 iFrame;
4871348719
ma_uint32 iChannel;
4871448720
ma_uint64 interpolatedFrameCount;
48721+
const ma_uint32 channels = pGainer->config.channels;
4871548722

4871648723
MA_ASSERT(pGainer != NULL);
4871748724

@@ -48751,12 +48758,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4875148758
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4875248759
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4875348760

48754-
if (pGainer->config.channels <= 32) {
48761+
if (channels <= 32) {
4875548762
float pRunningGain[32];
4875648763
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4875748764

4875848765
/* Initialize the running gain. */
48759-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48766+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4876048767
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
4876148768
pRunningGainDelta[iChannel] = t * d;
4876248769
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48765,7 +48772,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4876548772
iFrame = 0;
4876648773

4876748774
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48768-
if (pGainer->config.channels == 2) {
48775+
if (channels == 2) {
4876948776
#if defined(MA_SUPPORT_SSE2)
4877048777
if (ma_has_sse2()) {
4877148778
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48813,6 +48820,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4881348820

4881448821
iFrame = unrolledLoopCount << 1;
4881548822
#else
48823+
#pragma clang loop vectorize(enable)
4881648824
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4881748825
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4881848826
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48824,7 +48832,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4882448832
}
4882548833
#endif
4882648834
}
48827-
} else if (pGainer->config.channels == 6) {
48835+
} else if (channels == 6) {
4882848836
#if defined(MA_SUPPORT_SSE2)
4882948837
if (ma_has_sse2()) {
4883048838
/*
@@ -48868,7 +48876,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4886848876
}
4886948877
}
4887048878
}
48871-
} else if (pGainer->config.channels == 8) {
48879+
} else if (channels == 8) {
4887248880
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4887348881
#if defined(MA_SUPPORT_SSE2)
4887448882
if (ma_has_sse2()) {
@@ -48889,29 +48897,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4888948897
{
4889048898
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
4889148899
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48900+
#pragma clang loop vectorize(enable)
4889248901
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4889348902
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
4889448903
}
4889548904

4889648905
/* Move the running gain forward towards the new gain. */
48906+
#pragma clang loop vectorize(enable)
4889748907
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4889848908
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4889948909
}
4890048910
}
4890148911
}
4890248912
}
4890348913

48914+
#pragma clang loop unroll(disable)
4890448915
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48905-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48906-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48916+
#pragma clang loop vectorize(enable)
48917+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48918+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4890748919
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4890848920
}
4890948921
}
4891048922
} else {
4891148923
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48924+
#pragma clang loop unroll(disable)
4891248925
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48913-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48914-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48926+
#pragma clang loop vectorize(enable)
48927+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48928+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4891548929
}
4891648930

4891748931
a += d;
@@ -48930,18 +48944,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4893048944

4893148945
/* All we need to do here is apply the new gains using an optimized path. */
4893248946
if (pFramesOut != NULL && pFramesIn != NULL) {
48933-
if (pGainer->config.channels <= 32) {
48947+
if (channels <= 32) {
4893448948
float gains[32];
48935-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48949+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4893648950
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4893748951
}
4893848952

48939-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
48953+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4894048954
} else {
4894148955
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48956+
#pragma clang loop unroll(disable)
4894248957
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48943-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48944-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48958+
#pragma clang loop vectorize(enable)
48959+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48960+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4894548961
}
4894648962
}
4894748963
}
@@ -51343,7 +51359,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5134351359

5134451360
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5134551361

51346-
MA_ASSUME(channels > 0);
51362+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5134751363
for (c = 0; c < channels; c += 1) {
5134851364
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5134951365
pFrameOut[c] = s;
@@ -51362,7 +51378,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5136251378

5136351379
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5136451380

51365-
MA_ASSUME(channels > 0);
51381+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5136651382
for (c = 0; c < channels; c += 1) {
5136751383
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5136851384
pFrameOut[c] = s;
@@ -51533,7 +51549,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
5153351549
}
5153451550

5153551551

51536-
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51552+
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
5153751553
{
5153851554
const float* pFramesInF32;
5153951555
/* */ float* pFramesOutF32;
@@ -51609,7 +51625,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
5160951625
return MA_SUCCESS;
5161051626
}
5161151627

51612-
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51628+
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
5161351629
{
5161451630
const float* pFramesInF32;
5161551631
/* */ float* pFramesOutF32;
@@ -52922,6 +52938,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5292252938
#endif
5292352939
{
5292452940
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52941+
#pragma clang loop vectorize(enable)
5292552942
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5292652943
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
5292752944
}
@@ -52949,6 +52966,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5294952966
#endif
5295052967
{
5295152968
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52969+
#pragma clang loop vectorize(enable)
5295252970
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5295352971
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
5295452972
}
@@ -52966,6 +52984,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5296652984
#endif
5296752985
{
5296852986
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52987+
#pragma clang loop vectorize(enable)
5296952988
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5297052989
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
5297152990
}
@@ -66711,7 +66730,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6671166730
ma_uint64 iFrame;
6671266731
ma_uint32 iChannel;
6671366732
const ma_uint32 channels = pNoise->config.channels;
66714-
MA_ASSUME(channels > 0);
66733+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6671566734

6671666735
if (pNoise->config.format == ma_format_f32) {
6671766736
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66830,7 +66849,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6683066849
ma_uint64 iFrame;
6683166850
ma_uint32 iChannel;
6683266851
const ma_uint32 channels = pNoise->config.channels;
66833-
MA_ASSUME(channels > 0);
66852+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6683466853

6683566854
if (pNoise->config.format == ma_format_f32) {
6683666855
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66912,7 +66931,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6691266931
ma_uint64 iFrame;
6691366932
ma_uint32 iChannel;
6691466933
const ma_uint32 channels = pNoise->config.channels;
66915-
MA_ASSUME(channels > 0);
66934+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6691666935

6691766936
if (pNoise->config.format == ma_format_f32) {
6691866937
float* pFramesOutF32 = (float*)pFramesOut;

0 commit comments

Comments
 (0)