Skip to content

Commit 01f1f64

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place (see llvm/llvm-project#42332). Disable loop unrolling wherever it goes too nuts, enable vectorization where it doesn't do so automatically, etc. Signed-off-by: Steven Noonan <[email protected]>
1 parent 35f2246 commit 01f1f64

File tree

1 file changed

+55
-31
lines changed

1 file changed

+55
-31
lines changed

miniaudio.h

+55-31
Original file line numberDiff line numberDiff line change
@@ -42017,7 +42017,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4201742017
}
4201842018
}
4201942019

42020-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42020+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4202142021
{
4202242022
ma_uint64 iSample;
4202342023

@@ -44587,7 +44587,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4458744587
const float a1 = pBQ->a1.f32;
4458844588
const float a2 = pBQ->a2.f32;
4458944589

44590-
MA_ASSUME(channels > 0);
44590+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44591+
#pragma clang loop unroll(disable)
4459144592
for (c = 0; c < channels; c += 1) {
4459244593
float r1 = pBQ->pR1[c].f32;
4459344594
float r2 = pBQ->pR2[c].f32;
@@ -44619,7 +44620,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4461944620
const ma_int32 a1 = pBQ->a1.s32;
4462044621
const ma_int32 a2 = pBQ->a2.s32;
4462144622

44622-
MA_ASSUME(channels > 0);
44623+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44624+
#pragma clang loop unroll(disable)
4462344625
for (c = 0; c < channels; c += 1) {
4462444626
ma_int32 r1 = pBQ->pR1[c].s32;
4462544627
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -44893,22 +44895,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4489344895
return MA_SUCCESS;
4489444896
}
4489544897

44896-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
44898+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4489744899
{
4489844900
ma_uint32 c;
4489944901
const ma_uint32 channels = pLPF->channels;
4490044902
const float a = pLPF->a.f32;
4490144903
const float b = 1 - a;
4490244904

44903-
MA_ASSUME(channels > 0);
44905+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44906+
#pragma clang loop unroll(disable)
4490444907
for (c = 0; c < channels; c += 1) {
4490544908
float r1 = pLPF->pR1[c].f32;
44906-
float x = pX[c];
44909+
float x = pX[c];
4490744910
float y;
4490844911

44909-
y = b*x + a*r1;
44912+
y = b * x + a * r1;
4491044913

44911-
pY[c] = y;
44914+
pY[c] = y;
4491244915
pLPF->pR1[c].f32 = y;
4491344916
}
4491444917
}
@@ -44920,7 +44923,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4492044923
const ma_int32 a = pLPF->a.s32;
4492144924
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4492244925

44923-
MA_ASSUME(channels > 0);
44926+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44927+
#pragma clang loop unroll(disable)
4492444928
for (c = 0; c < channels; c += 1) {
4492544929
ma_int32 r1 = pLPF->pR1[c].s32;
4492644930
ma_int32 x = pX[c];
@@ -45773,7 +45777,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4577345777
const float a = 1 - pHPF->a.f32;
4577445778
const float b = 1 - a;
4577545779

45776-
MA_ASSUME(channels > 0);
45780+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4577745781
for (c = 0; c < channels; c += 1) {
4577845782
float r1 = pHPF->pR1[c].f32;
4577945783
float x = pX[c];
@@ -45793,7 +45797,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4579345797
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4579445798
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4579545799

45796-
MA_ASSUME(channels > 0);
45800+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4579745801
for (c = 0; c < channels; c += 1) {
4579845802
ma_int32 r1 = pHPF->pR1[c].s32;
4579945803
ma_int32 x = pX[c];
@@ -47901,6 +47905,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4790147905
ma_uint64 iFrame;
4790247906
ma_uint32 iChannel;
4790347907
ma_uint64 interpolatedFrameCount;
47908+
const ma_uint32 channels = pGainer->config.channels;
4790447909

4790547910
MA_ASSERT(pGainer != NULL);
4790647911

@@ -47940,12 +47945,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4794047945
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4794147946
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4794247947

47943-
if (pGainer->config.channels <= 32) {
47948+
if (channels <= 32) {
4794447949
float pRunningGain[32];
4794547950
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4794647951

4794747952
/* Initialize the running gain. */
47948-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
47953+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4794947954
float t = (pGainer->pOldGains[iChannel] - pGainer->pNewGains[iChannel]) * pGainer->masterVolume;
4795047955
pRunningGainDelta[iChannel] = t * d;
4795147956
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -47954,7 +47959,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4795447959
iFrame = 0;
4795547960

4795647961
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
47957-
if (pGainer->config.channels == 2) {
47962+
if (channels == 2) {
4795847963
#if defined(MA_SUPPORT_SSE2)
4795947964
if (ma_has_sse2()) {
4796047965
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48002,6 +48007,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4800248007

4800348008
iFrame = unrolledLoopCount << 1;
4800448009
#else
48010+
#pragma clang loop vectorize(enable)
4800548011
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4800648012
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4800748013
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48013,7 +48019,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4801348019
}
4801448020
#endif
4801548021
}
48016-
} else if (pGainer->config.channels == 6) {
48022+
} else if (channels == 6) {
4801748023
#if defined(MA_SUPPORT_SSE2)
4801848024
if (ma_has_sse2()) {
4801948025
/*
@@ -48046,6 +48052,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4804648052
} else
4804748053
#endif
4804848054
{
48055+
#pragma clang loop vectorize(enable)
4804948056
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4805048057
for (iChannel = 0; iChannel < 6; iChannel += 1) {
4805148058
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48057,7 +48064,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4805748064
}
4805848065
}
4805948066
}
48060-
} else if (pGainer->config.channels == 8) {
48067+
} else if (channels == 8) {
4806148068
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4806248069
#if defined(MA_SUPPORT_SSE2)
4806348070
if (ma_has_sse2()) {
@@ -48077,6 +48084,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4807748084
#endif
4807848085
{
4807948086
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48087+
#pragma clang loop vectorize(enable)
4808048088
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4808148089
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4808248090
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48090,17 +48098,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4809048098
}
4809148099
}
4809248100

48101+
#pragma clang loop unroll(disable)
4809348102
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48094-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48095-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48103+
#pragma clang loop vectorize(enable)
48104+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48105+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4809648106
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4809748107
}
4809848108
}
4809948109
} else {
4810048110
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48111+
#pragma clang loop unroll(disable)
4810148112
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48102-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48103-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48113+
#pragma clang loop vectorize(enable)
48114+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48115+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4810448116
}
4810548117

4810648118
a += d;
@@ -48119,18 +48131,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4811948131

4812048132
/* All we need to do here is apply the new gains using an optimized path. */
4812148133
if (pFramesOut != NULL && pFramesIn != NULL) {
48122-
if (pGainer->config.channels <= 32) {
48134+
if (channels <= 32) {
4812348135
float gains[32];
48124-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48136+
#pragma clang loop unroll(disable)
48137+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4812548138
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4812648139
}
4812748140

48128-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
48141+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4812948142
} else {
4813048143
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48144+
#pragma clang loop unroll(disable)
4813148145
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48132-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48133-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48146+
#pragma clang loop vectorize(enable)
48147+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48148+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4813448149
}
4813548150
}
4813648151
}
@@ -50491,7 +50506,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5049150506

5049250507
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5049350508

50494-
MA_ASSUME(channels > 0);
50509+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5049550510
for (c = 0; c < channels; c += 1) {
5049650511
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5049750512
pFrameOut[c] = s;
@@ -50510,7 +50525,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5051050525

5051150526
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5051250527

50513-
MA_ASSUME(channels > 0);
50528+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5051450529
for (c = 0; c < channels; c += 1) {
5051550530
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5051650531
pFrameOut[c] = s;
@@ -51745,6 +51760,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
5174551760
ma_uint64 iFrame;
5174651761
ma_uint32 iChannelOut;
5174751762

51763+
#pragma clang loop unroll(disable)
5174851764
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5174951765
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5175051766
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51765,6 +51781,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
5176551781
ma_uint64 iFrame;
5176651782
ma_uint32 iChannelOut;
5176751783

51784+
#pragma clang loop unroll(disable)
5176851785
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5176951786
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5177051787
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51807,6 +51824,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
5180751824
ma_uint64 iFrame;
5180851825
ma_uint32 iChannelOut;
5180951826

51827+
#pragma clang loop unroll(disable)
5181051828
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5181151829
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5181251830
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51827,6 +51845,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
5182751845
ma_uint64 iFrame;
5182851846
ma_uint32 iChannelOut;
5182951847

51848+
#pragma clang loop unroll(disable)
5183051849
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5183151850
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5183251851
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52061,6 +52080,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5206152080
} else
5206252081
#endif
5206352082
{
52083+
#pragma clang loop vectorize(enable)
5206452084
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5206552085
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5206652086
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52088,6 +52108,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5208852108
} else
5208952109
#endif
5209052110
{
52111+
#pragma clang loop vectorize(enable)
5209152112
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5209252113
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5209352114
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52105,6 +52126,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5210552126
} else
5210652127
#endif
5210752128
{
52129+
#pragma clang loop vectorize(enable)
5210852130
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5210952131
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5211052132
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -65254,7 +65276,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6525465276
ma_uint64 iFrame;
6525565277
ma_uint32 iChannel;
6525665278
const ma_uint32 channels = pNoise->config.channels;
65257-
MA_ASSUME(channels > 0);
65279+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6525865280

6525965281
if (pNoise->config.format == ma_format_f32) {
6526065282
float* pFramesOutF32 = (float*)pFramesOut;
@@ -65373,7 +65395,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6537365395
ma_uint64 iFrame;
6537465396
ma_uint32 iChannel;
6537565397
const ma_uint32 channels = pNoise->config.channels;
65376-
MA_ASSUME(channels > 0);
65398+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6537765399

6537865400
if (pNoise->config.format == ma_format_f32) {
6537965401
float* pFramesOutF32 = (float*)pFramesOut;
@@ -65455,7 +65477,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6545565477
ma_uint64 iFrame;
6545665478
ma_uint32 iChannel;
6545765479
const ma_uint32 channels = pNoise->config.channels;
65458-
MA_ASSUME(channels > 0);
65480+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6545965481

6546065482
if (pNoise->config.format == ma_format_f32) {
6546165483
float* pFramesOutF32 = (float*)pFramesOut;
@@ -69634,7 +69656,7 @@ MA_API void ma_debug_fill_pcm_frames_with_sine_wave(float* pFramesOut, ma_uint32
6963469656

6963569657

6963669658

69637-
static ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64 frameCount, ma_uint32 channels, float volume)
69659+
static ma_result ma_mix_pcm_frames_f32(float* MA_RESTRICT pDst, const float* MA_RESTRICT pSrc, ma_uint64 frameCount, ma_uint32 channels, float volume)
6963869660
{
6963969661
ma_uint64 iSample;
6964069662
ma_uint64 sampleCount;
@@ -69650,10 +69672,12 @@ static ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
6965069672
sampleCount = frameCount * channels;
6965169673

6965269674
if (volume == 1) {
69675+
#pragma clang loop vectorize(enable)
6965369676
for (iSample = 0; iSample < sampleCount; iSample += 1) {
6965469677
pDst[iSample] += pSrc[iSample];
6965569678
}
6965669679
} else {
69680+
#pragma clang loop vectorize(enable)
6965769681
for (iSample = 0; iSample < sampleCount; iSample += 1) {
6965869682
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
6965969683
}

0 commit comments

Comments
 (0)