Skip to content

Commit 5cd3bc3

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place (see llvm/llvm-project#42332). Disable loop unrolling wherever it gets excessive, enable vectorization where Clang doesn't do so automatically, etc. Signed-off-by: Steven Noonan <[email protected]>
1 parent 8dd8320 commit 5cd3bc3

File tree

1 file changed

+51
-32
lines changed

1 file changed

+51
-32
lines changed

miniaudio.h

+51-32
Original file line numberDiff line numberDiff line change
@@ -42913,7 +42913,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4291342913
}
4291442914
}
4291542915

42916-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42916+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4291742917
{
4291842918
ma_uint64 iSample;
4291942919

@@ -43208,10 +43208,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
4320843208
sampleCount = frameCount * channels;
4320943209

4321043210
if (volume == 1) {
43211+
#pragma clang loop vectorize(enable)
4321143212
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4321243213
pDst[iSample] += pSrc[iSample];
4321343214
}
4321443215
} else {
43216+
#pragma clang loop vectorize(enable)
4321543217
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4321643218
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
4321743219
}
@@ -45514,7 +45516,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4551445516
const float a1 = pBQ->a1.f32;
4551545517
const float a2 = pBQ->a2.f32;
4551645518

45517-
MA_ASSUME(channels > 0);
45519+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45520+
#pragma clang loop vectorize(assume_safety)
4551845521
for (c = 0; c < channels; c += 1) {
4551945522
float r1 = pBQ->pR1[c].f32;
4552045523
float r2 = pBQ->pR2[c].f32;
@@ -45546,7 +45549,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4554645549
const ma_int32 a1 = pBQ->a1.s32;
4554745550
const ma_int32 a2 = pBQ->a2.s32;
4554845551

45549-
MA_ASSUME(channels > 0);
45552+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45553+
#pragma clang loop vectorize(assume_safety)
4555045554
for (c = 0; c < channels; c += 1) {
4555145555
ma_int32 r1 = pBQ->pR1[c].s32;
4555245556
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45820,22 +45824,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4582045824
return MA_SUCCESS;
4582145825
}
4582245826

45823-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45827+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4582445828
{
4582545829
ma_uint32 c;
4582645830
const ma_uint32 channels = pLPF->channels;
4582745831
const float a = pLPF->a.f32;
4582845832
const float b = 1 - a;
4582945833

45830-
MA_ASSUME(channels > 0);
45834+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45835+
#pragma clang loop vectorize(assume_safety)
4583145836
for (c = 0; c < channels; c += 1) {
4583245837
float r1 = pLPF->pR1[c].f32;
45833-
float x = pX[c];
45838+
float x = pX[c];
4583445839
float y;
4583545840

45836-
y = b*x + a*r1;
45841+
y = b * x + a * r1;
4583745842

45838-
pY[c] = y;
45843+
pY[c] = y;
4583945844
pLPF->pR1[c].f32 = y;
4584045845
}
4584145846
}
@@ -45847,7 +45852,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4584745852
const ma_int32 a = pLPF->a.s32;
4584845853
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4584945854

45850-
MA_ASSUME(channels > 0);
45855+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45856+
#pragma clang loop vectorize(assume_safety)
4585145857
for (c = 0; c < channels; c += 1) {
4585245858
ma_int32 r1 = pLPF->pR1[c].s32;
4585345859
ma_int32 x = pX[c];
@@ -46700,7 +46706,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4670046706
const float a = 1 - pHPF->a.f32;
4670146707
const float b = 1 - a;
4670246708

46703-
MA_ASSUME(channels > 0);
46709+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4670446710
for (c = 0; c < channels; c += 1) {
4670546711
float r1 = pHPF->pR1[c].f32;
4670646712
float x = pX[c];
@@ -46720,7 +46726,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4672046726
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4672146727
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4672246728

46723-
MA_ASSUME(channels > 0);
46729+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4672446730
for (c = 0; c < channels; c += 1) {
4672546731
ma_int32 r1 = pHPF->pR1[c].s32;
4672646732
ma_int32 x = pX[c];
@@ -48828,6 +48834,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4882848834
ma_uint64 iFrame;
4882948835
ma_uint32 iChannel;
4883048836
ma_uint64 interpolatedFrameCount;
48837+
const ma_uint32 channels = pGainer->config.channels;
4883148838

4883248839
MA_ASSERT(pGainer != NULL);
4883348840

@@ -48867,12 +48874,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4886748874
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4886848875
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4886948876

48870-
if (pGainer->config.channels <= 32) {
48877+
if (channels <= 32) {
4887148878
float pRunningGain[32];
4887248879
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4887348880

4887448881
/* Initialize the running gain. */
48875-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48882+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4887648883
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
4887748884
pRunningGainDelta[iChannel] = t * d;
4887848885
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48881,7 +48888,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4888148888
iFrame = 0;
4888248889

4888348890
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48884-
if (pGainer->config.channels == 2) {
48891+
if (channels == 2) {
4888548892
#if defined(MA_SUPPORT_SSE2)
4888648893
if (ma_has_sse2()) {
4888748894
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48929,6 +48936,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4892948936

4893048937
iFrame = unrolledLoopCount << 1;
4893148938
#else
48939+
#pragma clang loop vectorize(enable)
4893248940
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4893348941
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4893448942
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48940,7 +48948,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4894048948
}
4894148949
#endif
4894248950
}
48943-
} else if (pGainer->config.channels == 6) {
48951+
} else if (channels == 6) {
4894448952
#if defined(MA_SUPPORT_SSE2)
4894548953
if (ma_has_sse2()) {
4894648954
/*
@@ -48984,7 +48992,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4898448992
}
4898548993
}
4898648994
}
48987-
} else if (pGainer->config.channels == 8) {
48995+
} else if (channels == 8) {
4898848996
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4898948997
#if defined(MA_SUPPORT_SSE2)
4899048998
if (ma_has_sse2()) {
@@ -49005,29 +49013,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4900549013
{
4900649014
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
4900749015
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
49016+
#pragma clang loop vectorize(enable)
4900849017
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4900949018
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
4901049019
}
4901149020

4901249021
/* Move the running gain forward towards the new gain. */
49022+
#pragma clang loop vectorize(enable)
4901349023
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4901449024
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4901549025
}
4901649026
}
4901749027
}
4901849028
}
4901949029

49030+
#pragma clang loop unroll(disable)
4902049031
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
49021-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49022-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
49032+
#pragma clang loop vectorize(enable)
49033+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
49034+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4902349035
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4902449036
}
4902549037
}
4902649038
} else {
4902749039
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
49040+
#pragma clang loop unroll(disable)
4902849041
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
49029-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49030-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
49042+
#pragma clang loop vectorize(enable)
49043+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
49044+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4903149045
}
4903249046

4903349047
a += d;
@@ -49046,18 +49060,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4904649060

4904749061
/* All we need to do here is apply the new gains using an optimized path. */
4904849062
if (pFramesOut != NULL && pFramesIn != NULL) {
49049-
if (pGainer->config.channels <= 32) {
49063+
if (channels <= 32) {
4905049064
float gains[32];
49051-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49065+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4905249066
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4905349067
}
4905449068

49055-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
49069+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4905649070
} else {
4905749071
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
49072+
#pragma clang loop unroll(disable)
4905849073
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
49059-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49060-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49074+
#pragma clang loop vectorize(enable)
49075+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
49076+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4906149077
}
4906249078
}
4906349079
}
@@ -51459,7 +51475,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5145951475

5146051476
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5146151477

51462-
MA_ASSUME(channels > 0);
51478+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5146351479
for (c = 0; c < channels; c += 1) {
5146451480
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5146551481
pFrameOut[c] = s;
@@ -51478,7 +51494,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5147851494

5147951495
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5148051496

51481-
MA_ASSUME(channels > 0);
51497+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5148251498
for (c = 0; c < channels; c += 1) {
5148351499
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5148451500
pFrameOut[c] = s;
@@ -51649,7 +51665,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
5164951665
}
5165051666

5165151667

51652-
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51668+
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
5165351669
{
5165451670
const float* pFramesInF32;
5165551671
/* */ float* pFramesOutF32;
@@ -51725,7 +51741,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
5172551741
return MA_SUCCESS;
5172651742
}
5172751743

51728-
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51744+
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
5172951745
{
5173051746
const float* pFramesInF32;
5173151747
/* */ float* pFramesOutF32;
@@ -53038,6 +53054,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5303853054
#endif
5303953055
{
5304053056
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53057+
#pragma clang loop vectorize(enable)
5304153058
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5304253059
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
5304353060
}
@@ -53065,6 +53082,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5306553082
#endif
5306653083
{
5306753084
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53085+
#pragma clang loop vectorize(enable)
5306853086
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5306953087
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
5307053088
}
@@ -53082,6 +53100,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5308253100
#endif
5308353101
{
5308453102
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53103+
#pragma clang loop vectorize(enable)
5308553104
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5308653105
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
5308753106
}
@@ -66827,7 +66846,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6682766846
ma_uint64 iFrame;
6682866847
ma_uint32 iChannel;
6682966848
const ma_uint32 channels = pNoise->config.channels;
66830-
MA_ASSUME(channels > 0);
66849+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6683166850

6683266851
if (pNoise->config.format == ma_format_f32) {
6683366852
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66946,7 +66965,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6694666965
ma_uint64 iFrame;
6694766966
ma_uint32 iChannel;
6694866967
const ma_uint32 channels = pNoise->config.channels;
66949-
MA_ASSUME(channels > 0);
66968+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6695066969

6695166970
if (pNoise->config.format == ma_format_f32) {
6695266971
float* pFramesOutF32 = (float*)pFramesOut;
@@ -67028,7 +67047,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6702867047
ma_uint64 iFrame;
6702967048
ma_uint32 iChannel;
6703067049
const ma_uint32 channels = pNoise->config.channels;
67031-
MA_ASSUME(channels > 0);
67050+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6703267051

6703367052
if (pNoise->config.format == ma_format_f32) {
6703467053
float* pFramesOutF32 = (float*)pFramesOut;

0 commit comments

Comments
 (0)