@@ -42822,7 +42822,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42822
42822
}
42823
42823
}
42824
42824
42825
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42825
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42826
42826
{
42827
42827
ma_uint64 iSample;
42828
42828
@@ -43117,10 +43117,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
43117
43117
sampleCount = frameCount * channels;
43118
43118
43119
43119
if (volume == 1) {
43120
+ #pragma clang loop vectorize(enable)
43120
43121
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43121
43122
pDst[iSample] += pSrc[iSample];
43122
43123
}
43123
43124
} else {
43125
+ #pragma clang loop vectorize(enable)
43124
43126
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43125
43127
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
43126
43128
}
@@ -45423,7 +45425,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
45423
45425
const float a1 = pBQ->a1.f32;
45424
45426
const float a2 = pBQ->a2.f32;
45425
45427
45426
- MA_ASSUME(channels > 0);
45428
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45429
+ #pragma clang loop vectorize(assume_safety)
45427
45430
for (c = 0; c < channels; c += 1) {
45428
45431
float r1 = pBQ->pR1[c].f32;
45429
45432
float r2 = pBQ->pR2[c].f32;
@@ -45455,7 +45458,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
45455
45458
const ma_int32 a1 = pBQ->a1.s32;
45456
45459
const ma_int32 a2 = pBQ->a2.s32;
45457
45460
45458
- MA_ASSUME(channels > 0);
45461
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45462
+ #pragma clang loop vectorize(assume_safety)
45459
45463
for (c = 0; c < channels; c += 1) {
45460
45464
ma_int32 r1 = pBQ->pR1[c].s32;
45461
45465
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45729,22 +45733,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
45729
45733
return MA_SUCCESS;
45730
45734
}
45731
45735
/*
Diff rendering of ma_lpf1_process_pcm_frame_f32(). Lines starting with "-" are the removed (old) text,
lines starting with "+" are the added (new) text, and bare numbers are the old/new line-number gutters.

What the function does (both sides of the diff): for every channel index c in [0, pLPF->channels) it reads
the input sample x = pX[c] and the stored value r1 = pLPF->pR1[c].f32, computes y = b*x + a*r1 with
a = pLPF->a.f32 and b = 1 - a, writes y to pY[c], and stores y back into pLPF->pR1[c].f32.

What the "+" side changes: the MA_ASSUME() hint now bounds channels to [MA_MIN_CHANNELS, MA_MAX_CHANNELS]
instead of only stating channels > 0, a Clang loop-vectorization pragma is added in front of the channel
loop, and spacing is adjusted on a few lines. The arithmetic is the same on both sides.
*/
45732
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45736
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
45733
45737
{
45734
45738
ma_uint32 c;
45735
45739
const ma_uint32 channels = pLPF->channels;
45736
45740
const float a = pLPF->a.f32;
45737
45741
const float b = 1 - a;
45738
45742
45739
- MA_ASSUME(channels > 0);
45743
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
/*
NOTE(review): vectorize(assume_safety) asks Clang to vectorize without proving the absence of memory
dependencies. pY, pX and pLPF->pR1 are not restrict-qualified in this signature, so this relies on callers
never passing partially overlapping buffers - confirm against call sites.
NOTE(review): the pragma is not wrapped in a compiler check within this hunk; other compilers may warn
about an unknown pragma - confirm whether that matters for the supported toolchains.
*/
45744
+ #pragma clang loop vectorize(assume_safety)
45740
45745
for (c = 0; c < channels; c += 1) {
45741
45746
float r1 = pLPF->pR1[c].f32;
45742
- float x = pX[c];
45747
+ float x = pX[c];
45743
45748
float y;
45744
45749
45745
- y = b* x + a* r1;
45750
+ y = b * x + a * r1;
45746
45751
45747
- pY[c] = y;
45752
+ pY[c] = y;
45748
45753
/* The output is kept as the per-channel state that the next call reads back as r1. */
pLPF->pR1[c].f32 = y;
45749
45754
}
45750
45755
}
@@ -45756,7 +45761,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
45756
45761
const ma_int32 a = pLPF->a.s32;
45757
45762
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45758
45763
45759
- MA_ASSUME(channels > 0);
45764
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45765
+ #pragma clang loop vectorize(assume_safety)
45760
45766
for (c = 0; c < channels; c += 1) {
45761
45767
ma_int32 r1 = pLPF->pR1[c].s32;
45762
45768
ma_int32 x = pX[c];
@@ -46609,7 +46615,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
46609
46615
const float a = 1 - pHPF->a.f32;
46610
46616
const float b = 1 - a;
46611
46617
46612
- MA_ASSUME(channels > 0 );
46618
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46613
46619
for (c = 0; c < channels; c += 1) {
46614
46620
float r1 = pHPF->pR1[c].f32;
46615
46621
float x = pX[c];
@@ -46629,7 +46635,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
46629
46635
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
46630
46636
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
46631
46637
46632
- MA_ASSUME(channels > 0 );
46638
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46633
46639
for (c = 0; c < channels; c += 1) {
46634
46640
ma_int32 r1 = pHPF->pR1[c].s32;
46635
46641
ma_int32 x = pX[c];
@@ -48737,6 +48743,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48737
48743
ma_uint64 iFrame;
48738
48744
ma_uint32 iChannel;
48739
48745
ma_uint64 interpolatedFrameCount;
48746
+ const ma_uint32 channels = pGainer->config.channels;
48740
48747
48741
48748
MA_ASSERT(pGainer != NULL);
48742
48749
@@ -48776,12 +48783,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48776
48783
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
48777
48784
float d = 1.0f / pGainer->config.smoothTimeInFrames;
48778
48785
48779
- if (pGainer->config. channels <= 32) {
48786
+ if (channels <= 32) {
48780
48787
float pRunningGain[32];
48781
48788
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
48782
48789
48783
48790
/* Initialize the running gain. */
48784
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48791
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48785
48792
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
48786
48793
pRunningGainDelta[iChannel] = t * d;
48787
48794
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48790,7 +48797,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48790
48797
iFrame = 0;
48791
48798
48792
48799
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48793
- if (pGainer->config. channels == 2) {
48800
+ if (channels == 2) {
48794
48801
#if defined(MA_SUPPORT_SSE2)
48795
48802
if (ma_has_sse2()) {
48796
48803
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48838,6 +48845,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48838
48845
48839
48846
iFrame = unrolledLoopCount << 1;
48840
48847
#else
48848
+ #pragma clang loop vectorize(enable)
48841
48849
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48842
48850
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48843
48851
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48849,7 +48857,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48849
48857
}
48850
48858
#endif
48851
48859
}
48852
- } else if (pGainer->config. channels == 6) {
48860
+ } else if (channels == 6) {
48853
48861
#if defined(MA_SUPPORT_SSE2)
48854
48862
if (ma_has_sse2()) {
48855
48863
/*
@@ -48893,7 +48901,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48893
48901
}
48894
48902
}
48895
48903
}
48896
- } else if (pGainer->config. channels == 8) {
48904
+ } else if (channels == 8) {
48897
48905
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48898
48906
#if defined(MA_SUPPORT_SSE2)
48899
48907
if (ma_has_sse2()) {
@@ -48914,29 +48922,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48914
48922
{
48915
48923
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48916
48924
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48925
+ #pragma clang loop vectorize(enable)
48917
48926
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48918
48927
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
48919
48928
}
48920
48929
48921
48930
/* Move the running gain forward towards the new gain. */
48931
+ #pragma clang loop vectorize(enable)
48922
48932
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48923
48933
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48924
48934
}
48925
48935
}
48926
48936
}
48927
48937
}
48928
48938
48939
+ #pragma clang loop unroll(disable)
48929
48940
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48930
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48931
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48941
+ #pragma clang loop vectorize(enable)
48942
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48943
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
48932
48944
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48933
48945
}
48934
48946
}
48935
48947
} else {
48936
48948
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48949
+ #pragma clang loop unroll(disable)
48937
48950
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48938
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48939
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48951
+ #pragma clang loop vectorize(enable)
48952
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48953
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48940
48954
}
48941
48955
48942
48956
a += d;
@@ -48955,18 +48969,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48955
48969
48956
48970
/* All we need to do here is apply the new gains using an optimized path. */
48957
48971
if (pFramesOut != NULL && pFramesIn != NULL) {
48958
- if (pGainer->config. channels <= 32) {
48972
+ if (channels <= 32) {
48959
48973
float gains[32];
48960
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48974
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48961
48975
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48962
48976
}
48963
48977
48964
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
48978
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
48965
48979
} else {
48966
48980
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48981
+ #pragma clang loop unroll(disable)
48967
48982
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48968
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48969
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48983
+ #pragma clang loop vectorize(enable)
48984
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48985
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48970
48986
}
48971
48987
}
48972
48988
}
@@ -51368,7 +51384,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
51368
51384
51369
51385
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
51370
51386
51371
- MA_ASSUME(channels > 0 );
51387
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51372
51388
for (c = 0; c < channels; c += 1) {
51373
51389
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
51374
51390
pFrameOut[c] = s;
@@ -51387,7 +51403,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
51387
51403
51388
51404
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
51389
51405
51390
- MA_ASSUME(channels > 0 );
51406
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51391
51407
for (c = 0; c < channels; c += 1) {
51392
51408
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
51393
51409
pFrameOut[c] = s;
@@ -51558,7 +51574,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
51558
51574
}
51559
51575
51560
51576
51561
- static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51577
+ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51562
51578
{
51563
51579
const float* pFramesInF32;
51564
51580
/* */ float* pFramesOutF32;
@@ -51634,7 +51650,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
51634
51650
return MA_SUCCESS;
51635
51651
}
51636
51652
51637
- static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51653
+ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51638
51654
{
51639
51655
const float* pFramesInF32;
51640
51656
/* */ float* pFramesOutF32;
@@ -52947,6 +52963,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52947
52963
#endif
52948
52964
{
52949
52965
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52966
+ #pragma clang loop vectorize(enable)
52950
52967
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
52951
52968
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
52952
52969
}
@@ -52974,6 +52991,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52974
52991
#endif
52975
52992
{
52976
52993
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52994
+ #pragma clang loop vectorize(enable)
52977
52995
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
52978
52996
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
52979
52997
}
@@ -52991,6 +53009,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52991
53009
#endif
52992
53010
{
52993
53011
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53012
+ #pragma clang loop vectorize(enable)
52994
53013
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
52995
53014
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
52996
53015
}
@@ -66736,7 +66755,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
66736
66755
ma_uint64 iFrame;
66737
66756
ma_uint32 iChannel;
66738
66757
const ma_uint32 channels = pNoise->config.channels;
66739
- MA_ASSUME(channels > 0 );
66758
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66740
66759
66741
66760
if (pNoise->config.format == ma_format_f32) {
66742
66761
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66855,7 +66874,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
66855
66874
ma_uint64 iFrame;
66856
66875
ma_uint32 iChannel;
66857
66876
const ma_uint32 channels = pNoise->config.channels;
66858
- MA_ASSUME(channels > 0 );
66877
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66859
66878
66860
66879
if (pNoise->config.format == ma_format_f32) {
66861
66880
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66937,7 +66956,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
66937
66956
ma_uint64 iFrame;
66938
66957
ma_uint32 iChannel;
66939
66958
const ma_uint32 channels = pNoise->config.channels;
66940
- MA_ASSUME(channels > 0 );
66959
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66941
66960
66942
66961
if (pNoise->config.format == ma_format_f32) {
66943
66962
float* pFramesOutF32 = (float*)pFramesOut;
0 commit comments