Skip to content

Commit 0eb7086

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place (see llvm/llvm-project#42332). Disable loop unrolling wherever it goes too nuts, enable vectorization where Clang doesn't do so automatically, etc. Signed-off-by: Steven Noonan <[email protected]>
1 parent abb592f commit 0eb7086

File tree

1 file changed: +51 -32 lines changed

1 file changed: +51 -32 lines changed

miniaudio.h

+51 -32
Original file line number | Diff line number | Diff line change
@@ -42822,7 +42822,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4282242822
}
4282342823
}
4282442824

42825-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42825+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4282642826
{
4282742827
ma_uint64 iSample;
4282842828

@@ -43117,10 +43117,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
4311743117
sampleCount = frameCount * channels;
4311843118

4311943119
if (volume == 1) {
43120+
#pragma clang loop vectorize(enable)
4312043121
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4312143122
pDst[iSample] += pSrc[iSample];
4312243123
}
4312343124
} else {
43125+
#pragma clang loop vectorize(enable)
4312443126
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4312543127
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
4312643128
}
@@ -45423,7 +45425,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4542345425
const float a1 = pBQ->a1.f32;
4542445426
const float a2 = pBQ->a2.f32;
4542545427

45426-
MA_ASSUME(channels > 0);
45428+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45429+
#pragma clang loop vectorize(assume_safety)
4542745430
for (c = 0; c < channels; c += 1) {
4542845431
float r1 = pBQ->pR1[c].f32;
4542945432
float r2 = pBQ->pR2[c].f32;
@@ -45455,7 +45458,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4545545458
const ma_int32 a1 = pBQ->a1.s32;
4545645459
const ma_int32 a2 = pBQ->a2.s32;
4545745460

45458-
MA_ASSUME(channels > 0);
45461+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45462+
#pragma clang loop vectorize(assume_safety)
4545945463
for (c = 0; c < channels; c += 1) {
4546045464
ma_int32 r1 = pBQ->pR1[c].s32;
4546145465
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45729,22 +45733,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4572945733
return MA_SUCCESS;
4573045734
}
4573145735

45732-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45736+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4573345737
{
4573445738
ma_uint32 c;
4573545739
const ma_uint32 channels = pLPF->channels;
4573645740
const float a = pLPF->a.f32;
4573745741
const float b = 1 - a;
4573845742

45739-
MA_ASSUME(channels > 0);
45743+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45744+
#pragma clang loop vectorize(assume_safety)
4574045745
for (c = 0; c < channels; c += 1) {
4574145746
float r1 = pLPF->pR1[c].f32;
45742-
float x = pX[c];
45747+
float x = pX[c];
4574345748
float y;
4574445749

45745-
y = b*x + a*r1;
45750+
y = b * x + a * r1;
4574645751

45747-
pY[c] = y;
45752+
pY[c] = y;
4574845753
pLPF->pR1[c].f32 = y;
4574945754
}
4575045755
}
@@ -45756,7 +45761,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4575645761
const ma_int32 a = pLPF->a.s32;
4575745762
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4575845763

45759-
MA_ASSUME(channels > 0);
45764+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45765+
#pragma clang loop vectorize(assume_safety)
4576045766
for (c = 0; c < channels; c += 1) {
4576145767
ma_int32 r1 = pLPF->pR1[c].s32;
4576245768
ma_int32 x = pX[c];
@@ -46609,7 +46615,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4660946615
const float a = 1 - pHPF->a.f32;
4661046616
const float b = 1 - a;
4661146617

46612-
MA_ASSUME(channels > 0);
46618+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4661346619
for (c = 0; c < channels; c += 1) {
4661446620
float r1 = pHPF->pR1[c].f32;
4661546621
float x = pX[c];
@@ -46629,7 +46635,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4662946635
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4663046636
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4663146637

46632-
MA_ASSUME(channels > 0);
46638+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4663346639
for (c = 0; c < channels; c += 1) {
4663446640
ma_int32 r1 = pHPF->pR1[c].s32;
4663546641
ma_int32 x = pX[c];
@@ -48737,6 +48743,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4873748743
ma_uint64 iFrame;
4873848744
ma_uint32 iChannel;
4873948745
ma_uint64 interpolatedFrameCount;
48746+
const ma_uint32 channels = pGainer->config.channels;
4874048747

4874148748
MA_ASSERT(pGainer != NULL);
4874248749

@@ -48776,12 +48783,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4877648783
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4877748784
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4877848785

48779-
if (pGainer->config.channels <= 32) {
48786+
if (channels <= 32) {
4878048787
float pRunningGain[32];
4878148788
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4878248789

4878348790
/* Initialize the running gain. */
48784-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48791+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4878548792
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
4878648793
pRunningGainDelta[iChannel] = t * d;
4878748794
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48790,7 +48797,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4879048797
iFrame = 0;
4879148798

4879248799
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48793-
if (pGainer->config.channels == 2) {
48800+
if (channels == 2) {
4879448801
#if defined(MA_SUPPORT_SSE2)
4879548802
if (ma_has_sse2()) {
4879648803
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48838,6 +48845,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4883848845

4883948846
iFrame = unrolledLoopCount << 1;
4884048847
#else
48848+
#pragma clang loop vectorize(enable)
4884148849
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4884248850
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4884348851
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48849,7 +48857,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4884948857
}
4885048858
#endif
4885148859
}
48852-
} else if (pGainer->config.channels == 6) {
48860+
} else if (channels == 6) {
4885348861
#if defined(MA_SUPPORT_SSE2)
4885448862
if (ma_has_sse2()) {
4885548863
/*
@@ -48893,7 +48901,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4889348901
}
4889448902
}
4889548903
}
48896-
} else if (pGainer->config.channels == 8) {
48904+
} else if (channels == 8) {
4889748905
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4889848906
#if defined(MA_SUPPORT_SSE2)
4889948907
if (ma_has_sse2()) {
@@ -48914,29 +48922,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4891448922
{
4891548923
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
4891648924
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48925+
#pragma clang loop vectorize(enable)
4891748926
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4891848927
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
4891948928
}
4892048929

4892148930
/* Move the running gain forward towards the new gain. */
48931+
#pragma clang loop vectorize(enable)
4892248932
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4892348933
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4892448934
}
4892548935
}
4892648936
}
4892748937
}
4892848938

48939+
#pragma clang loop unroll(disable)
4892948940
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48930-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48931-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48941+
#pragma clang loop vectorize(enable)
48942+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48943+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4893248944
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4893348945
}
4893448946
}
4893548947
} else {
4893648948
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48949+
#pragma clang loop unroll(disable)
4893748950
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48938-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48939-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48951+
#pragma clang loop vectorize(enable)
48952+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48953+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4894048954
}
4894148955

4894248956
a += d;
@@ -48955,18 +48969,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4895548969

4895648970
/* All we need to do here is apply the new gains using an optimized path. */
4895748971
if (pFramesOut != NULL && pFramesIn != NULL) {
48958-
if (pGainer->config.channels <= 32) {
48972+
if (channels <= 32) {
4895948973
float gains[32];
48960-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48974+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4896148975
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4896248976
}
4896348977

48964-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
48978+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4896548979
} else {
4896648980
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48981+
#pragma clang loop unroll(disable)
4896748982
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48968-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48969-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48983+
#pragma clang loop vectorize(enable)
48984+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48985+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4897048986
}
4897148987
}
4897248988
}
@@ -51368,7 +51384,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5136851384

5136951385
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5137051386

51371-
MA_ASSUME(channels > 0);
51387+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5137251388
for (c = 0; c < channels; c += 1) {
5137351389
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5137451390
pFrameOut[c] = s;
@@ -51387,7 +51403,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5138751403

5138851404
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5138951405

51390-
MA_ASSUME(channels > 0);
51406+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5139151407
for (c = 0; c < channels; c += 1) {
5139251408
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5139351409
pFrameOut[c] = s;
@@ -51558,7 +51574,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
5155851574
}
5155951575

5156051576

51561-
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51577+
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
5156251578
{
5156351579
const float* pFramesInF32;
5156451580
/* */ float* pFramesOutF32;
@@ -51634,7 +51650,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
5163451650
return MA_SUCCESS;
5163551651
}
5163651652

51637-
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51653+
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
5163851654
{
5163951655
const float* pFramesInF32;
5164051656
/* */ float* pFramesOutF32;
@@ -52947,6 +52963,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5294752963
#endif
5294852964
{
5294952965
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52966+
#pragma clang loop vectorize(enable)
5295052967
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5295152968
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
5295252969
}
@@ -52974,6 +52991,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5297452991
#endif
5297552992
{
5297652993
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52994+
#pragma clang loop vectorize(enable)
5297752995
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5297852996
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
5297952997
}
@@ -52991,6 +53009,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5299153009
#endif
5299253010
{
5299353011
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53012+
#pragma clang loop vectorize(enable)
5299453013
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5299553014
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
5299653015
}
@@ -66736,7 +66755,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6673666755
ma_uint64 iFrame;
6673766756
ma_uint32 iChannel;
6673866757
const ma_uint32 channels = pNoise->config.channels;
66739-
MA_ASSUME(channels > 0);
66758+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6674066759

6674166760
if (pNoise->config.format == ma_format_f32) {
6674266761
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66855,7 +66874,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6685566874
ma_uint64 iFrame;
6685666875
ma_uint32 iChannel;
6685766876
const ma_uint32 channels = pNoise->config.channels;
66858-
MA_ASSUME(channels > 0);
66877+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6685966878

6686066879
if (pNoise->config.format == ma_format_f32) {
6686166880
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66937,7 +66956,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6693766956
ma_uint64 iFrame;
6693866957
ma_uint32 iChannel;
6693966958
const ma_uint32 channels = pNoise->config.channels;
66940-
MA_ASSUME(channels > 0);
66959+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6694166960

6694266961
if (pNoise->config.format == ma_format_f32) {
6694366962
float* pFramesOutF32 = (float*)pFramesOut;

0 commit comments

Comments
 (0)