@@ -42797,7 +42797,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42797
42797
}
42798
42798
}
42799
42799
42800
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42800
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42801
42801
{
42802
42802
ma_uint64 iSample;
42803
42803
@@ -43092,10 +43092,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
43092
43092
sampleCount = frameCount * channels;
43093
43093
43094
43094
if (volume == 1) {
43095
+ #pragma clang loop vectorize(enable)
43095
43096
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43096
43097
pDst[iSample] += pSrc[iSample];
43097
43098
}
43098
43099
} else {
43100
+ #pragma clang loop vectorize(enable)
43099
43101
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43100
43102
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
43101
43103
}
@@ -45398,7 +45400,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
45398
45400
const float a1 = pBQ->a1.f32;
45399
45401
const float a2 = pBQ->a2.f32;
45400
45402
45401
- MA_ASSUME(channels > 0);
45403
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45404
+ #pragma clang loop vectorize(assume_safety)
45402
45405
for (c = 0; c < channels; c += 1) {
45403
45406
float r1 = pBQ->pR1[c].f32;
45404
45407
float r2 = pBQ->pR2[c].f32;
@@ -45430,7 +45433,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
45430
45433
const ma_int32 a1 = pBQ->a1.s32;
45431
45434
const ma_int32 a2 = pBQ->a2.s32;
45432
45435
45433
- MA_ASSUME(channels > 0);
45436
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45437
+ #pragma clang loop vectorize(assume_safety)
45434
45438
for (c = 0; c < channels; c += 1) {
45435
45439
ma_int32 r1 = pBQ->pR1[c].s32;
45436
45440
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45704,22 +45708,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
45704
45708
return MA_SUCCESS;
45705
45709
}
45706
45710
45707
/*
Applies the first-order low-pass filter to a single frame of f32 samples.

For each channel c in [0, pLPF->channels):
    r1     = pLPF->pR1[c].f32        (value stored by the previous call)
    y      = b*pX[c] + a*r1          (a = pLPF->a.f32, b = 1 - a)
    pY[c]  = y
    pLPF->pR1[c].f32 = y             (stored for the next call)

pLPF - Filter object; supplies the channel count, the coefficient `a` and the per-channel state array pR1.
pY   - Output frame; one float is written per channel.
pX   - Input frame; one float is read per channel.

In this diff view, lines prefixed with '-' are the previous revision and lines prefixed with '+' replace them.
*/
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45711
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
45708
45712
{
45709
45713
ma_uint32 c;
45710
45714
const ma_uint32 channels = pLPF->channels;
45711
45715
const float a = pLPF->a.f32;
45712
45716
const float b = 1 - a;
45713
45717
/*
The new revision tightens the channel-count assumption from "> 0" to the [MA_MIN_CHANNELS, MA_MAX_CHANNELS] range and
adds a Clang loop hint. MA_ASSUME is defined elsewhere; presumably it is an optimizer hint - TODO confirm.

NOTE(review): vectorize(assume_safety) tells Clang to skip its loop dependence checks. Each iteration only touches
index c of pX, pY and pR1, so same-index in-place use (pY == pX) carries no cross-iteration dependence, but partially
overlapping buffers would - confirm that no caller passes such buffers.

NOTE(review): the pragma is not wrapped in a compiler check within this hunk; other compilers may emit an
unknown-pragma warning - confirm how the build handles this.
*/
45714
- MA_ASSUME(channels > 0);
45718
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45719
+ #pragma clang loop vectorize(assume_safety)
45715
45720
for (c = 0; c < channels; c += 1) {
45716
45721
float r1 = pLPF->pR1[c].f32;
45717
- float x = pX[c];
45722
+ float x = pX[c];
45718
45723
float y;
45719
45724
45720
- y = b* x + a* r1;
45725
+ y = b * x + a * r1;
45721
45726
45722
- pY[c] = y;
45727
+ pY[c] = y;
45723
45728
/* Store the output as this channel's state for the next frame. */
pLPF->pR1[c].f32 = y;
45724
45729
}
45725
45730
}
@@ -45731,7 +45736,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
45731
45736
const ma_int32 a = pLPF->a.s32;
45732
45737
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45733
45738
45734
- MA_ASSUME(channels > 0);
45739
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45740
+ #pragma clang loop vectorize(assume_safety)
45735
45741
for (c = 0; c < channels; c += 1) {
45736
45742
ma_int32 r1 = pLPF->pR1[c].s32;
45737
45743
ma_int32 x = pX[c];
@@ -46584,7 +46590,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
46584
46590
const float a = 1 - pHPF->a.f32;
46585
46591
const float b = 1 - a;
46586
46592
46587
- MA_ASSUME(channels > 0 );
46593
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46588
46594
for (c = 0; c < channels; c += 1) {
46589
46595
float r1 = pHPF->pR1[c].f32;
46590
46596
float x = pX[c];
@@ -46604,7 +46610,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
46604
46610
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
46605
46611
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
46606
46612
46607
- MA_ASSUME(channels > 0 );
46613
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46608
46614
for (c = 0; c < channels; c += 1) {
46609
46615
ma_int32 r1 = pHPF->pR1[c].s32;
46610
46616
ma_int32 x = pX[c];
@@ -48712,6 +48718,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48712
48718
ma_uint64 iFrame;
48713
48719
ma_uint32 iChannel;
48714
48720
ma_uint64 interpolatedFrameCount;
48721
+ const ma_uint32 channels = pGainer->config.channels;
48715
48722
48716
48723
MA_ASSERT(pGainer != NULL);
48717
48724
@@ -48751,12 +48758,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48751
48758
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
48752
48759
float d = 1.0f / pGainer->config.smoothTimeInFrames;
48753
48760
48754
- if (pGainer->config. channels <= 32) {
48761
+ if (channels <= 32) {
48755
48762
float pRunningGain[32];
48756
48763
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
48757
48764
48758
48765
/* Initialize the running gain. */
48759
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48766
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48760
48767
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
48761
48768
pRunningGainDelta[iChannel] = t * d;
48762
48769
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48765,7 +48772,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48765
48772
iFrame = 0;
48766
48773
48767
48774
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48768
- if (pGainer->config. channels == 2) {
48775
+ if (channels == 2) {
48769
48776
#if defined(MA_SUPPORT_SSE2)
48770
48777
if (ma_has_sse2()) {
48771
48778
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48813,6 +48820,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48813
48820
48814
48821
iFrame = unrolledLoopCount << 1;
48815
48822
#else
48823
+ #pragma clang loop vectorize(enable)
48816
48824
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48817
48825
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48818
48826
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48824,7 +48832,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48824
48832
}
48825
48833
#endif
48826
48834
}
48827
- } else if (pGainer->config. channels == 6) {
48835
+ } else if (channels == 6) {
48828
48836
#if defined(MA_SUPPORT_SSE2)
48829
48837
if (ma_has_sse2()) {
48830
48838
/*
@@ -48868,7 +48876,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48868
48876
}
48869
48877
}
48870
48878
}
48871
- } else if (pGainer->config. channels == 8) {
48879
+ } else if (channels == 8) {
48872
48880
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48873
48881
#if defined(MA_SUPPORT_SSE2)
48874
48882
if (ma_has_sse2()) {
@@ -48889,29 +48897,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48889
48897
{
48890
48898
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48891
48899
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48900
+ #pragma clang loop vectorize(enable)
48892
48901
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48893
48902
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
48894
48903
}
48895
48904
48896
48905
/* Move the running gain forward towards the new gain. */
48906
+ #pragma clang loop vectorize(enable)
48897
48907
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48898
48908
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48899
48909
}
48900
48910
}
48901
48911
}
48902
48912
}
48903
48913
48914
+ #pragma clang loop unroll(disable)
48904
48915
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48905
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48906
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48916
+ #pragma clang loop vectorize(enable)
48917
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48918
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
48907
48919
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48908
48920
}
48909
48921
}
48910
48922
} else {
48911
48923
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48924
+ #pragma clang loop unroll(disable)
48912
48925
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48913
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48914
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48926
+ #pragma clang loop vectorize(enable)
48927
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48928
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48915
48929
}
48916
48930
48917
48931
a += d;
@@ -48930,18 +48944,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48930
48944
48931
48945
/* All we need to do here is apply the new gains using an optimized path. */
48932
48946
if (pFramesOut != NULL && pFramesIn != NULL) {
48933
- if (pGainer->config. channels <= 32) {
48947
+ if (channels <= 32) {
48934
48948
float gains[32];
48935
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48949
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48936
48950
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48937
48951
}
48938
48952
48939
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
48953
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
48940
48954
} else {
48941
48955
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48956
+ #pragma clang loop unroll(disable)
48942
48957
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48943
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48944
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48958
+ #pragma clang loop vectorize(enable)
48959
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48960
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48945
48961
}
48946
48962
}
48947
48963
}
@@ -51343,7 +51359,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
51343
51359
51344
51360
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
51345
51361
51346
- MA_ASSUME(channels > 0 );
51362
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51347
51363
for (c = 0; c < channels; c += 1) {
51348
51364
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
51349
51365
pFrameOut[c] = s;
@@ -51362,7 +51378,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
51362
51378
51363
51379
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
51364
51380
51365
- MA_ASSUME(channels > 0 );
51381
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51366
51382
for (c = 0; c < channels; c += 1) {
51367
51383
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
51368
51384
pFrameOut[c] = s;
@@ -51533,7 +51549,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
51533
51549
}
51534
51550
51535
51551
51536
- static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51552
+ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51537
51553
{
51538
51554
const float* pFramesInF32;
51539
51555
/* */ float* pFramesOutF32;
@@ -51609,7 +51625,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
51609
51625
return MA_SUCCESS;
51610
51626
}
51611
51627
51612
- static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51628
+ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51613
51629
{
51614
51630
const float* pFramesInF32;
51615
51631
/* */ float* pFramesOutF32;
@@ -52922,6 +52938,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52922
52938
#endif
52923
52939
{
52924
52940
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52941
+ #pragma clang loop vectorize(enable)
52925
52942
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
52926
52943
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
52927
52944
}
@@ -52949,6 +52966,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52949
52966
#endif
52950
52967
{
52951
52968
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52969
+ #pragma clang loop vectorize(enable)
52952
52970
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
52953
52971
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
52954
52972
}
@@ -52966,6 +52984,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52966
52984
#endif
52967
52985
{
52968
52986
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52987
+ #pragma clang loop vectorize(enable)
52969
52988
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
52970
52989
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
52971
52990
}
@@ -66711,7 +66730,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
66711
66730
ma_uint64 iFrame;
66712
66731
ma_uint32 iChannel;
66713
66732
const ma_uint32 channels = pNoise->config.channels;
66714
- MA_ASSUME(channels > 0 );
66733
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66715
66734
66716
66735
if (pNoise->config.format == ma_format_f32) {
66717
66736
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66830,7 +66849,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
66830
66849
ma_uint64 iFrame;
66831
66850
ma_uint32 iChannel;
66832
66851
const ma_uint32 channels = pNoise->config.channels;
66833
- MA_ASSUME(channels > 0 );
66852
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66834
66853
66835
66854
if (pNoise->config.format == ma_format_f32) {
66836
66855
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66912,7 +66931,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
66912
66931
ma_uint64 iFrame;
66913
66932
ma_uint32 iChannel;
66914
66933
const ma_uint32 channels = pNoise->config.channels;
66915
- MA_ASSUME(channels > 0 );
66934
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66916
66935
66917
66936
if (pNoise->config.format == ma_format_f32) {
66918
66937
float* pFramesOutF32 = (float*)pFramesOut;
0 commit comments