Skip to content

Commit 2a9fec4

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place:

    llvm/llvm-project#42332

Disable loop unrolling wherever it goes too nuts, enable vectorization where it doesn't do so automatically, etc.

Signed-off-by: Steven Noonan <[email protected]>
1 parent 5c3816a commit 2a9fec4

File tree

1 file changed

+54
-30
lines changed

1 file changed

+54
-30
lines changed

miniaudio.h

+54-30
Original file line numberDiff line numberDiff line change
@@ -42819,7 +42819,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4281942819
}
4282042820
}
4282142821

42822-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42822+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4282342823
{
4282442824
ma_uint64 iSample;
4282542825

@@ -43114,10 +43114,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
4311443114
sampleCount = frameCount * channels;
4311543115

4311643116
if (volume == 1) {
43117+
#pragma clang loop vectorize(enable)
4311743118
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4311843119
pDst[iSample] += pSrc[iSample];
4311943120
}
4312043121
} else {
43122+
#pragma clang loop vectorize(enable)
4312143123
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4312243124
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
4312343125
}
@@ -45418,7 +45420,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4541845420
const float a1 = pBQ->a1.f32;
4541945421
const float a2 = pBQ->a2.f32;
4542045422

45421-
MA_ASSUME(channels > 0);
45423+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45424+
#pragma clang loop unroll(disable)
4542245425
for (c = 0; c < channels; c += 1) {
4542345426
float r1 = pBQ->pR1[c].f32;
4542445427
float r2 = pBQ->pR2[c].f32;
@@ -45450,7 +45453,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4545045453
const ma_int32 a1 = pBQ->a1.s32;
4545145454
const ma_int32 a2 = pBQ->a2.s32;
4545245455

45453-
MA_ASSUME(channels > 0);
45456+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45457+
#pragma clang loop unroll(disable)
4545445458
for (c = 0; c < channels; c += 1) {
4545545459
ma_int32 r1 = pBQ->pR1[c].s32;
4545645460
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45724,22 +45728,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4572445728
return MA_SUCCESS;
4572545729
}
4572645730

45727-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45731+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4572845732
{
4572945733
ma_uint32 c;
4573045734
const ma_uint32 channels = pLPF->channels;
4573145735
const float a = pLPF->a.f32;
4573245736
const float b = 1 - a;
4573345737

45734-
MA_ASSUME(channels > 0);
45738+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45739+
#pragma clang loop unroll(disable)
4573545740
for (c = 0; c < channels; c += 1) {
4573645741
float r1 = pLPF->pR1[c].f32;
45737-
float x = pX[c];
45742+
float x = pX[c];
4573845743
float y;
4573945744

45740-
y = b*x + a*r1;
45745+
y = b * x + a * r1;
4574145746

45742-
pY[c] = y;
45747+
pY[c] = y;
4574345748
pLPF->pR1[c].f32 = y;
4574445749
}
4574545750
}
@@ -45751,7 +45756,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4575145756
const ma_int32 a = pLPF->a.s32;
4575245757
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4575345758

45754-
MA_ASSUME(channels > 0);
45759+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45760+
#pragma clang loop unroll(disable)
4575545761
for (c = 0; c < channels; c += 1) {
4575645762
ma_int32 r1 = pLPF->pR1[c].s32;
4575745763
ma_int32 x = pX[c];
@@ -46604,7 +46610,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4660446610
const float a = 1 - pHPF->a.f32;
4660546611
const float b = 1 - a;
4660646612

46607-
MA_ASSUME(channels > 0);
46613+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4660846614
for (c = 0; c < channels; c += 1) {
4660946615
float r1 = pHPF->pR1[c].f32;
4661046616
float x = pX[c];
@@ -46624,7 +46630,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4662446630
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4662546631
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4662646632

46627-
MA_ASSUME(channels > 0);
46633+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4662846634
for (c = 0; c < channels; c += 1) {
4662946635
ma_int32 r1 = pHPF->pR1[c].s32;
4663046636
ma_int32 x = pX[c];
@@ -48732,6 +48738,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4873248738
ma_uint64 iFrame;
4873348739
ma_uint32 iChannel;
4873448740
ma_uint64 interpolatedFrameCount;
48741+
const ma_uint32 channels = pGainer->config.channels;
4873548742

4873648743
MA_ASSERT(pGainer != NULL);
4873748744

@@ -48771,12 +48778,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4877148778
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4877248779
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4877348780

48774-
if (pGainer->config.channels <= 32) {
48781+
if (channels <= 32) {
4877548782
float pRunningGain[32];
4877648783
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4877748784

4877848785
/* Initialize the running gain. */
48779-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48786+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4878048787
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
4878148788
pRunningGainDelta[iChannel] = t * d;
4878248789
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48785,7 +48792,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4878548792
iFrame = 0;
4878648793

4878748794
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48788-
if (pGainer->config.channels == 2) {
48795+
if (channels == 2) {
4878948796
#if defined(MA_SUPPORT_SSE2)
4879048797
if (ma_has_sse2()) {
4879148798
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48833,6 +48840,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4883348840

4883448841
iFrame = unrolledLoopCount << 1;
4883548842
#else
48843+
#pragma clang loop vectorize(enable)
4883648844
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4883748845
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4883848846
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48844,7 +48852,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4884448852
}
4884548853
#endif
4884648854
}
48847-
} else if (pGainer->config.channels == 6) {
48855+
} else if (channels == 6) {
4884848856
#if defined(MA_SUPPORT_SSE2)
4884948857
if (ma_has_sse2()) {
4885048858
/*
@@ -48877,6 +48885,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4887748885
} else
4887848886
#endif
4887948887
{
48888+
#pragma clang loop vectorize(enable)
4888048889
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4888148890
for (iChannel = 0; iChannel < 6; iChannel += 1) {
4888248891
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48888,7 +48897,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4888848897
}
4888948898
}
4889048899
}
48891-
} else if (pGainer->config.channels == 8) {
48900+
} else if (channels == 8) {
4889248901
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4889348902
#if defined(MA_SUPPORT_SSE2)
4889448903
if (ma_has_sse2()) {
@@ -48908,6 +48917,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4890848917
#endif
4890948918
{
4891048919
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48920+
#pragma clang loop vectorize(enable)
4891148921
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4891248922
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4891348923
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48921,17 +48931,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4892148931
}
4892248932
}
4892348933

48934+
#pragma clang loop unroll(disable)
4892448935
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48925-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48926-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48936+
#pragma clang loop vectorize(enable)
48937+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48938+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4892748939
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4892848940
}
4892948941
}
4893048942
} else {
4893148943
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48944+
#pragma clang loop unroll(disable)
4893248945
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48933-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48934-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48946+
#pragma clang loop vectorize(enable)
48947+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48948+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4893548949
}
4893648950

4893748951
a += d;
@@ -48950,18 +48964,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4895048964

4895148965
/* All we need to do here is apply the new gains using an optimized path. */
4895248966
if (pFramesOut != NULL && pFramesIn != NULL) {
48953-
if (pGainer->config.channels <= 32) {
48967+
if (channels <= 32) {
4895448968
float gains[32];
48955-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48969+
#pragma clang loop unroll(disable)
48970+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4895648971
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4895748972
}
4895848973

48959-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
48974+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4896048975
} else {
4896148976
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48977+
#pragma clang loop unroll(disable)
4896248978
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48963-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48964-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48979+
#pragma clang loop vectorize(enable)
48980+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48981+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4896548982
}
4896648983
}
4896748984
}
@@ -51331,7 +51348,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5133151348

5133251349
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5133351350

51334-
MA_ASSUME(channels > 0);
51351+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5133551352
for (c = 0; c < channels; c += 1) {
5133651353
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5133751354
pFrameOut[c] = s;
@@ -51350,7 +51367,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5135051367

5135151368
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5135251369

51353-
MA_ASSUME(channels > 0);
51370+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5135451371
for (c = 0; c < channels; c += 1) {
5135551372
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5135651373
pFrameOut[c] = s;
@@ -52585,6 +52602,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
5258552602
ma_uint64 iFrame;
5258652603
ma_uint32 iChannelOut;
5258752604

52605+
#pragma clang loop unroll(disable)
5258852606
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5258952607
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5259052608
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52605,6 +52623,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
5260552623
ma_uint64 iFrame;
5260652624
ma_uint32 iChannelOut;
5260752625

52626+
#pragma clang loop unroll(disable)
5260852627
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5260952628
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5261052629
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52647,6 +52666,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
5264752666
ma_uint64 iFrame;
5264852667
ma_uint32 iChannelOut;
5264952668

52669+
#pragma clang loop unroll(disable)
5265052670
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5265152671
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5265252672
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52667,6 +52687,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
5266752687
ma_uint64 iFrame;
5266852688
ma_uint32 iChannelOut;
5266952689

52690+
#pragma clang loop unroll(disable)
5267052691
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5267152692
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5267252693
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52901,6 +52922,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5290152922
} else
5290252923
#endif
5290352924
{
52925+
#pragma clang loop vectorize(enable)
5290452926
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5290552927
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5290652928
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52928,6 +52950,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5292852950
} else
5292952951
#endif
5293052952
{
52953+
#pragma clang loop vectorize(enable)
5293152954
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5293252955
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5293352956
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52945,6 +52968,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5294552968
} else
5294652969
#endif
5294752970
{
52971+
#pragma clang loop vectorize(enable)
5294852972
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5294952973
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5295052974
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -66268,7 +66292,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6626866292
ma_uint64 iFrame;
6626966293
ma_uint32 iChannel;
6627066294
const ma_uint32 channels = pNoise->config.channels;
66271-
MA_ASSUME(channels > 0);
66295+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6627266296

6627366297
if (pNoise->config.format == ma_format_f32) {
6627466298
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66387,7 +66411,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6638766411
ma_uint64 iFrame;
6638866412
ma_uint32 iChannel;
6638966413
const ma_uint32 channels = pNoise->config.channels;
66390-
MA_ASSUME(channels > 0);
66414+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6639166415

6639266416
if (pNoise->config.format == ma_format_f32) {
6639366417
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66469,7 +66493,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6646966493
ma_uint64 iFrame;
6647066494
ma_uint32 iChannel;
6647166495
const ma_uint32 channels = pNoise->config.channels;
66472-
MA_ASSUME(channels > 0);
66496+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6647366497

6647466498
if (pNoise->config.format == ma_format_f32) {
6647566499
float* pFramesOutF32 = (float*)pFramesOut;

0 commit comments

Comments
 (0)