@@ -42819,7 +42819,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42819
42819
}
42820
42820
}
42821
42821
42822
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42822
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42823
42823
{
42824
42824
ma_uint64 iSample;
42825
42825
@@ -43114,10 +43114,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
43114
43114
sampleCount = frameCount * channels;
43115
43115
43116
43116
if (volume == 1) {
43117
+ #pragma clang loop vectorize(enable)
43117
43118
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43118
43119
pDst[iSample] += pSrc[iSample];
43119
43120
}
43120
43121
} else {
43122
+ #pragma clang loop vectorize(enable)
43121
43123
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43122
43124
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
43123
43125
}
@@ -45418,7 +45420,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
45418
45420
const float a1 = pBQ->a1.f32;
45419
45421
const float a2 = pBQ->a2.f32;
45420
45422
45421
- MA_ASSUME(channels > 0);
45423
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45424
+ #pragma clang loop unroll(disable)
45422
45425
for (c = 0; c < channels; c += 1) {
45423
45426
float r1 = pBQ->pR1[c].f32;
45424
45427
float r2 = pBQ->pR2[c].f32;
@@ -45450,7 +45453,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
45450
45453
const ma_int32 a1 = pBQ->a1.s32;
45451
45454
const ma_int32 a2 = pBQ->a2.s32;
45452
45455
45453
- MA_ASSUME(channels > 0);
45456
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45457
+ #pragma clang loop unroll(disable)
45454
45458
for (c = 0; c < channels; c += 1) {
45455
45459
ma_int32 r1 = pBQ->pR1[c].s32;
45456
45460
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45724,22 +45728,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
45724
45728
return MA_SUCCESS;
45725
45729
}
45726
45730
45727
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45731
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
45728
45732
{
45729
45733
ma_uint32 c;
45730
45734
const ma_uint32 channels = pLPF->channels;
45731
45735
const float a = pLPF->a.f32;
45732
45736
const float b = 1 - a;
45733
45737
45734
- MA_ASSUME(channels > 0);
45738
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45739
+ #pragma clang loop unroll(disable)
45735
45740
for (c = 0; c < channels; c += 1) {
45736
45741
float r1 = pLPF->pR1[c].f32;
45737
- float x = pX[c];
45742
+ float x = pX[c];
45738
45743
float y;
45739
45744
45740
- y = b* x + a* r1;
45745
+ y = b * x + a * r1;
45741
45746
45742
- pY[c] = y;
45747
+ pY[c] = y;
45743
45748
pLPF->pR1[c].f32 = y;
45744
45749
}
45745
45750
}
@@ -45751,7 +45756,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
45751
45756
const ma_int32 a = pLPF->a.s32;
45752
45757
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45753
45758
45754
- MA_ASSUME(channels > 0);
45759
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45760
+ #pragma clang loop unroll(disable)
45755
45761
for (c = 0; c < channels; c += 1) {
45756
45762
ma_int32 r1 = pLPF->pR1[c].s32;
45757
45763
ma_int32 x = pX[c];
@@ -46604,7 +46610,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
46604
46610
const float a = 1 - pHPF->a.f32;
46605
46611
const float b = 1 - a;
46606
46612
46607
- MA_ASSUME(channels > 0 );
46613
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46608
46614
for (c = 0; c < channels; c += 1) {
46609
46615
float r1 = pHPF->pR1[c].f32;
46610
46616
float x = pX[c];
@@ -46624,7 +46630,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
46624
46630
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
46625
46631
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
46626
46632
46627
- MA_ASSUME(channels > 0 );
46633
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46628
46634
for (c = 0; c < channels; c += 1) {
46629
46635
ma_int32 r1 = pHPF->pR1[c].s32;
46630
46636
ma_int32 x = pX[c];
@@ -48732,6 +48738,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48732
48738
ma_uint64 iFrame;
48733
48739
ma_uint32 iChannel;
48734
48740
ma_uint64 interpolatedFrameCount;
48741
+ const ma_uint32 channels = pGainer->config.channels;
48735
48742
48736
48743
MA_ASSERT(pGainer != NULL);
48737
48744
@@ -48771,12 +48778,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48771
48778
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
48772
48779
float d = 1.0f / pGainer->config.smoothTimeInFrames;
48773
48780
48774
- if (pGainer->config. channels <= 32) {
48781
+ if (channels <= 32) {
48775
48782
float pRunningGain[32];
48776
48783
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
48777
48784
48778
48785
/* Initialize the running gain. */
48779
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48786
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48780
48787
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
48781
48788
pRunningGainDelta[iChannel] = t * d;
48782
48789
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48785,7 +48792,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48785
48792
iFrame = 0;
48786
48793
48787
48794
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48788
- if (pGainer->config. channels == 2) {
48795
+ if (channels == 2) {
48789
48796
#if defined(MA_SUPPORT_SSE2)
48790
48797
if (ma_has_sse2()) {
48791
48798
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48833,6 +48840,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48833
48840
48834
48841
iFrame = unrolledLoopCount << 1;
48835
48842
#else
48843
+ #pragma clang loop vectorize(enable)
48836
48844
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48837
48845
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48838
48846
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48844,7 +48852,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48844
48852
}
48845
48853
#endif
48846
48854
}
48847
- } else if (pGainer->config. channels == 6) {
48855
+ } else if (channels == 6) {
48848
48856
#if defined(MA_SUPPORT_SSE2)
48849
48857
if (ma_has_sse2()) {
48850
48858
/*
@@ -48877,6 +48885,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48877
48885
} else
48878
48886
#endif
48879
48887
{
48888
+ #pragma clang loop vectorize(enable)
48880
48889
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48881
48890
for (iChannel = 0; iChannel < 6; iChannel += 1) {
48882
48891
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48888,7 +48897,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48888
48897
}
48889
48898
}
48890
48899
}
48891
- } else if (pGainer->config. channels == 8) {
48900
+ } else if (channels == 8) {
48892
48901
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48893
48902
#if defined(MA_SUPPORT_SSE2)
48894
48903
if (ma_has_sse2()) {
@@ -48908,6 +48917,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48908
48917
#endif
48909
48918
{
48910
48919
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48920
+ #pragma clang loop vectorize(enable)
48911
48921
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48912
48922
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48913
48923
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48921,17 +48931,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48921
48931
}
48922
48932
}
48923
48933
48934
+ #pragma clang loop unroll(disable)
48924
48935
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48925
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48926
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48936
+ #pragma clang loop vectorize(enable)
48937
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48938
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
48927
48939
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48928
48940
}
48929
48941
}
48930
48942
} else {
48931
48943
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48944
+ #pragma clang loop unroll(disable)
48932
48945
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48933
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48934
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48946
+ #pragma clang loop vectorize(enable)
48947
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48948
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48935
48949
}
48936
48950
48937
48951
a += d;
@@ -48950,18 +48964,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48950
48964
48951
48965
/* All we need to do here is apply the new gains using an optimized path. */
48952
48966
if (pFramesOut != NULL && pFramesIn != NULL) {
48953
- if (pGainer->config. channels <= 32) {
48967
+ if (channels <= 32) {
48954
48968
float gains[32];
48955
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48969
+ #pragma clang loop unroll(disable)
48970
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48956
48971
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48957
48972
}
48958
48973
48959
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
48974
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
48960
48975
} else {
48961
48976
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48977
+ #pragma clang loop unroll(disable)
48962
48978
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48963
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48964
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48979
+ #pragma clang loop vectorize(enable)
48980
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48981
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48965
48982
}
48966
48983
}
48967
48984
}
@@ -51331,7 +51348,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
51331
51348
51332
51349
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
51333
51350
51334
- MA_ASSUME(channels > 0 );
51351
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51335
51352
for (c = 0; c < channels; c += 1) {
51336
51353
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
51337
51354
pFrameOut[c] = s;
@@ -51350,7 +51367,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
51350
51367
51351
51368
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
51352
51369
51353
- MA_ASSUME(channels > 0 );
51370
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51354
51371
for (c = 0; c < channels; c += 1) {
51355
51372
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
51356
51373
pFrameOut[c] = s;
@@ -52585,6 +52602,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
52585
52602
ma_uint64 iFrame;
52586
52603
ma_uint32 iChannelOut;
52587
52604
52605
+ #pragma clang loop unroll(disable)
52588
52606
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52589
52607
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52590
52608
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52605,6 +52623,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
52605
52623
ma_uint64 iFrame;
52606
52624
ma_uint32 iChannelOut;
52607
52625
52626
+ #pragma clang loop unroll(disable)
52608
52627
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52609
52628
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52610
52629
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52647,6 +52666,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
52647
52666
ma_uint64 iFrame;
52648
52667
ma_uint32 iChannelOut;
52649
52668
52669
+ #pragma clang loop unroll(disable)
52650
52670
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52651
52671
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52652
52672
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52667,6 +52687,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
52667
52687
ma_uint64 iFrame;
52668
52688
ma_uint32 iChannelOut;
52669
52689
52690
+ #pragma clang loop unroll(disable)
52670
52691
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52671
52692
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52672
52693
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52901,6 +52922,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52901
52922
} else
52902
52923
#endif
52903
52924
{
52925
+ #pragma clang loop vectorize(enable)
52904
52926
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52905
52927
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
52906
52928
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52928,6 +52950,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52928
52950
} else
52929
52951
#endif
52930
52952
{
52953
+ #pragma clang loop vectorize(enable)
52931
52954
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52932
52955
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
52933
52956
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52945,6 +52968,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52945
52968
} else
52946
52969
#endif
52947
52970
{
52971
+ #pragma clang loop vectorize(enable)
52948
52972
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52949
52973
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
52950
52974
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -66268,7 +66292,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
66268
66292
ma_uint64 iFrame;
66269
66293
ma_uint32 iChannel;
66270
66294
const ma_uint32 channels = pNoise->config.channels;
66271
- MA_ASSUME(channels > 0 );
66295
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66272
66296
66273
66297
if (pNoise->config.format == ma_format_f32) {
66274
66298
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66387,7 +66411,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
66387
66411
ma_uint64 iFrame;
66388
66412
ma_uint32 iChannel;
66389
66413
const ma_uint32 channels = pNoise->config.channels;
66390
- MA_ASSUME(channels > 0 );
66414
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66391
66415
66392
66416
if (pNoise->config.format == ma_format_f32) {
66393
66417
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66469,7 +66493,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
66469
66493
ma_uint64 iFrame;
66470
66494
ma_uint32 iChannel;
66471
66495
const ma_uint32 channels = pNoise->config.channels;
66472
- MA_ASSUME(channels > 0 );
66496
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66473
66497
66474
66498
if (pNoise->config.format == ma_format_f32) {
66475
66499
float* pFramesOutF32 = (float*)pFramesOut;
0 commit comments