@@ -42913,7 +42913,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42913
42913
}
42914
42914
}
42915
42915
42916
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42916
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42917
42917
{
42918
42918
ma_uint64 iSample;
42919
42919
@@ -43208,10 +43208,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
43208
43208
sampleCount = frameCount * channels;
43209
43209
43210
43210
if (volume == 1) {
43211
+ #pragma clang loop vectorize(enable)
43211
43212
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43212
43213
pDst[iSample] += pSrc[iSample];
43213
43214
}
43214
43215
} else {
43216
+ #pragma clang loop vectorize(enable)
43215
43217
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43216
43218
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
43217
43219
}
@@ -45514,7 +45516,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
45514
45516
const float a1 = pBQ->a1.f32;
45515
45517
const float a2 = pBQ->a2.f32;
45516
45518
45517
- MA_ASSUME(channels > 0);
45519
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45520
+ #pragma clang loop vectorize(assume_safety)
45518
45521
for (c = 0; c < channels; c += 1) {
45519
45522
float r1 = pBQ->pR1[c].f32;
45520
45523
float r2 = pBQ->pR2[c].f32;
@@ -45546,7 +45549,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
45546
45549
const ma_int32 a1 = pBQ->a1.s32;
45547
45550
const ma_int32 a2 = pBQ->a2.s32;
45548
45551
45549
- MA_ASSUME(channels > 0);
45552
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45553
+ #pragma clang loop vectorize(assume_safety)
45550
45554
for (c = 0; c < channels; c += 1) {
45551
45555
ma_int32 r1 = pBQ->pR1[c].s32;
45552
45556
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45820,22 +45824,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
45820
45824
return MA_SUCCESS;
45821
45825
}
45822
45826
45823
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45827
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
45824
45828
{
45825
45829
ma_uint32 c;
45826
45830
const ma_uint32 channels = pLPF->channels;
45827
45831
const float a = pLPF->a.f32;
45828
45832
const float b = 1 - a;
45829
45833
45830
- MA_ASSUME(channels > 0);
45834
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45835
+ #pragma clang loop vectorize(assume_safety)
45831
45836
for (c = 0; c < channels; c += 1) {
45832
45837
float r1 = pLPF->pR1[c].f32;
45833
- float x = pX[c];
45838
+ float x = pX[c];
45834
45839
float y;
45835
45840
45836
- y = b* x + a* r1;
45841
+ y = b * x + a * r1;
45837
45842
45838
- pY[c] = y;
45843
+ pY[c] = y;
45839
45844
pLPF->pR1[c].f32 = y;
45840
45845
}
45841
45846
}
@@ -45847,7 +45852,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
45847
45852
const ma_int32 a = pLPF->a.s32;
45848
45853
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45849
45854
45850
- MA_ASSUME(channels > 0);
45855
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45856
+ #pragma clang loop vectorize(assume_safety)
45851
45857
for (c = 0; c < channels; c += 1) {
45852
45858
ma_int32 r1 = pLPF->pR1[c].s32;
45853
45859
ma_int32 x = pX[c];
@@ -46700,7 +46706,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
46700
46706
const float a = 1 - pHPF->a.f32;
46701
46707
const float b = 1 - a;
46702
46708
46703
- MA_ASSUME(channels > 0 );
46709
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46704
46710
for (c = 0; c < channels; c += 1) {
46705
46711
float r1 = pHPF->pR1[c].f32;
46706
46712
float x = pX[c];
@@ -46720,7 +46726,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
46720
46726
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
46721
46727
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
46722
46728
46723
- MA_ASSUME(channels > 0 );
46729
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46724
46730
for (c = 0; c < channels; c += 1) {
46725
46731
ma_int32 r1 = pHPF->pR1[c].s32;
46726
46732
ma_int32 x = pX[c];
@@ -48828,6 +48834,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48828
48834
ma_uint64 iFrame;
48829
48835
ma_uint32 iChannel;
48830
48836
ma_uint64 interpolatedFrameCount;
48837
+ const ma_uint32 channels = pGainer->config.channels;
48831
48838
48832
48839
MA_ASSERT(pGainer != NULL);
48833
48840
@@ -48867,12 +48874,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48867
48874
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
48868
48875
float d = 1.0f / pGainer->config.smoothTimeInFrames;
48869
48876
48870
- if (pGainer->config. channels <= 32) {
48877
+ if (channels <= 32) {
48871
48878
float pRunningGain[32];
48872
48879
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
48873
48880
48874
48881
/* Initialize the running gain. */
48875
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48882
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48876
48883
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
48877
48884
pRunningGainDelta[iChannel] = t * d;
48878
48885
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48881,7 +48888,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48881
48888
iFrame = 0;
48882
48889
48883
48890
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48884
- if (pGainer->config. channels == 2) {
48891
+ if (channels == 2) {
48885
48892
#if defined(MA_SUPPORT_SSE2)
48886
48893
if (ma_has_sse2()) {
48887
48894
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48929,6 +48936,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48929
48936
48930
48937
iFrame = unrolledLoopCount << 1;
48931
48938
#else
48939
+ #pragma clang loop vectorize(enable)
48932
48940
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48933
48941
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48934
48942
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48940,7 +48948,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48940
48948
}
48941
48949
#endif
48942
48950
}
48943
- } else if (pGainer->config. channels == 6) {
48951
+ } else if (channels == 6) {
48944
48952
#if defined(MA_SUPPORT_SSE2)
48945
48953
if (ma_has_sse2()) {
48946
48954
/*
@@ -48984,7 +48992,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48984
48992
}
48985
48993
}
48986
48994
}
48987
- } else if (pGainer->config. channels == 8) {
48995
+ } else if (channels == 8) {
48988
48996
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48989
48997
#if defined(MA_SUPPORT_SSE2)
48990
48998
if (ma_has_sse2()) {
@@ -49005,29 +49013,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
49005
49013
{
49006
49014
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
49007
49015
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
49016
+ #pragma clang loop vectorize(enable)
49008
49017
for (iChannel = 0; iChannel < 8; iChannel += 1) {
49009
49018
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
49010
49019
}
49011
49020
49012
49021
/* Move the running gain forward towards the new gain. */
49022
+ #pragma clang loop vectorize(enable)
49013
49023
for (iChannel = 0; iChannel < 8; iChannel += 1) {
49014
49024
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
49015
49025
}
49016
49026
}
49017
49027
}
49018
49028
}
49019
49029
49030
+ #pragma clang loop unroll(disable)
49020
49031
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
49021
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49022
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
49032
+ #pragma clang loop vectorize(enable)
49033
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49034
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
49023
49035
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
49024
49036
}
49025
49037
}
49026
49038
} else {
49027
49039
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
49040
+ #pragma clang loop unroll(disable)
49028
49041
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
49029
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49030
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
49042
+ #pragma clang loop vectorize(enable)
49043
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49044
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
49031
49045
}
49032
49046
49033
49047
a += d;
@@ -49046,18 +49060,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
49046
49060
49047
49061
/* All we need to do here is apply the new gains using an optimized path. */
49048
49062
if (pFramesOut != NULL && pFramesIn != NULL) {
49049
- if (pGainer->config. channels <= 32) {
49063
+ if (channels <= 32) {
49050
49064
float gains[32];
49051
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
49065
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49052
49066
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49053
49067
}
49054
49068
49055
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
49069
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
49056
49070
} else {
49057
49071
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
49072
+ #pragma clang loop unroll(disable)
49058
49073
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
49059
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49060
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49074
+ #pragma clang loop vectorize(enable)
49075
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49076
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49061
49077
}
49062
49078
}
49063
49079
}
@@ -51459,7 +51475,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
51459
51475
51460
51476
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
51461
51477
51462
- MA_ASSUME(channels > 0 );
51478
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51463
51479
for (c = 0; c < channels; c += 1) {
51464
51480
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
51465
51481
pFrameOut[c] = s;
@@ -51478,7 +51494,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
51478
51494
51479
51495
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
51480
51496
51481
- MA_ASSUME(channels > 0 );
51497
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51482
51498
for (c = 0; c < channels; c += 1) {
51483
51499
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
51484
51500
pFrameOut[c] = s;
@@ -51649,7 +51665,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
51649
51665
}
51650
51666
51651
51667
51652
- static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51668
+ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51653
51669
{
51654
51670
const float* pFramesInF32;
51655
51671
/* */ float* pFramesOutF32;
@@ -51725,7 +51741,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
51725
51741
return MA_SUCCESS;
51726
51742
}
51727
51743
51728
- static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51744
+ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51729
51745
{
51730
51746
const float* pFramesInF32;
51731
51747
/* */ float* pFramesOutF32;
@@ -53038,6 +53054,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
53038
53054
#endif
53039
53055
{
53040
53056
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53057
+ #pragma clang loop vectorize(enable)
53041
53058
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
53042
53059
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
53043
53060
}
@@ -53065,6 +53082,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
53065
53082
#endif
53066
53083
{
53067
53084
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53085
+ #pragma clang loop vectorize(enable)
53068
53086
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
53069
53087
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
53070
53088
}
@@ -53082,6 +53100,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
53082
53100
#endif
53083
53101
{
53084
53102
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
53103
+ #pragma clang loop vectorize(enable)
53085
53104
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
53086
53105
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
53087
53106
}
@@ -66827,7 +66846,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
66827
66846
ma_uint64 iFrame;
66828
66847
ma_uint32 iChannel;
66829
66848
const ma_uint32 channels = pNoise->config.channels;
66830
- MA_ASSUME(channels > 0 );
66849
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66831
66850
66832
66851
if (pNoise->config.format == ma_format_f32) {
66833
66852
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66946,7 +66965,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
66946
66965
ma_uint64 iFrame;
66947
66966
ma_uint32 iChannel;
66948
66967
const ma_uint32 channels = pNoise->config.channels;
66949
- MA_ASSUME(channels > 0 );
66968
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66950
66969
66951
66970
if (pNoise->config.format == ma_format_f32) {
66952
66971
float* pFramesOutF32 = (float*)pFramesOut;
@@ -67028,7 +67047,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
67028
67047
ma_uint64 iFrame;
67029
67048
ma_uint32 iChannel;
67030
67049
const ma_uint32 channels = pNoise->config.channels;
67031
- MA_ASSUME(channels > 0 );
67050
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
67032
67051
67033
67052
if (pNoise->config.format == ma_format_f32) {
67034
67053
float* pFramesOutF32 = (float*)pFramesOut;
0 commit comments