@@ -42017,7 +42017,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42017
42017
}
42018
42018
}
42019
42019
42020
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42020
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42021
42021
{
42022
42022
ma_uint64 iSample;
42023
42023
@@ -44587,7 +44587,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
44587
44587
const float a1 = pBQ->a1.f32;
44588
44588
const float a2 = pBQ->a2.f32;
44589
44589
44590
- MA_ASSUME(channels > 0);
44590
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44591
+ #pragma clang loop unroll(disable)
44591
44592
for (c = 0; c < channels; c += 1) {
44592
44593
float r1 = pBQ->pR1[c].f32;
44593
44594
float r2 = pBQ->pR2[c].f32;
@@ -44619,7 +44620,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
44619
44620
const ma_int32 a1 = pBQ->a1.s32;
44620
44621
const ma_int32 a2 = pBQ->a2.s32;
44621
44622
44622
- MA_ASSUME(channels > 0);
44623
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44624
+ #pragma clang loop unroll(disable)
44623
44625
for (c = 0; c < channels; c += 1) {
44624
44626
ma_int32 r1 = pBQ->pR1[c].s32;
44625
44627
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -44893,22 +44895,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
44893
44895
return MA_SUCCESS;
44894
44896
}
44895
44897
44896
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
44898
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
44897
44899
{
44898
44900
ma_uint32 c;
44899
44901
const ma_uint32 channels = pLPF->channels;
44900
44902
const float a = pLPF->a.f32;
44901
44903
const float b = 1 - a;
44902
44904
44903
- MA_ASSUME(channels > 0);
44905
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44906
+ #pragma clang loop unroll(disable)
44904
44907
for (c = 0; c < channels; c += 1) {
44905
44908
float r1 = pLPF->pR1[c].f32;
44906
- float x = pX[c];
44909
+ float x = pX[c];
44907
44910
float y;
44908
44911
44909
- y = b* x + a* r1;
44912
+ y = b * x + a * r1;
44910
44913
44911
- pY[c] = y;
44914
+ pY[c] = y;
44912
44915
pLPF->pR1[c].f32 = y;
44913
44916
}
44914
44917
}
@@ -44920,7 +44923,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
44920
44923
const ma_int32 a = pLPF->a.s32;
44921
44924
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
44922
44925
44923
- MA_ASSUME(channels > 0);
44926
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44927
+ #pragma clang loop unroll(disable)
44924
44928
for (c = 0; c < channels; c += 1) {
44925
44929
ma_int32 r1 = pLPF->pR1[c].s32;
44926
44930
ma_int32 x = pX[c];
@@ -45773,7 +45777,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
45773
45777
const float a = 1 - pHPF->a.f32;
45774
45778
const float b = 1 - a;
45775
45779
45776
- MA_ASSUME(channels > 0 );
45780
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
45777
45781
for (c = 0; c < channels; c += 1) {
45778
45782
float r1 = pHPF->pR1[c].f32;
45779
45783
float x = pX[c];
@@ -45793,7 +45797,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
45793
45797
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
45794
45798
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45795
45799
45796
- MA_ASSUME(channels > 0 );
45800
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
45797
45801
for (c = 0; c < channels; c += 1) {
45798
45802
ma_int32 r1 = pHPF->pR1[c].s32;
45799
45803
ma_int32 x = pX[c];
@@ -47901,6 +47905,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
47901
47905
ma_uint64 iFrame;
47902
47906
ma_uint32 iChannel;
47903
47907
ma_uint64 interpolatedFrameCount;
47908
+ const ma_uint32 channels = pGainer->config.channels;
47904
47909
47905
47910
MA_ASSERT(pGainer != NULL);
47906
47911
@@ -47940,12 +47945,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
47940
47945
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
47941
47946
float d = 1.0f / pGainer->config.smoothTimeInFrames;
47942
47947
47943
- if (pGainer->config. channels <= 32) {
47948
+ if (channels <= 32) {
47944
47949
float pRunningGain[32];
47945
47950
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
47946
47951
47947
47952
/* Initialize the running gain. */
47948
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
47953
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
47949
47954
float t = (pGainer->pOldGains[iChannel] - pGainer->pNewGains[iChannel]) * pGainer->masterVolume;
47950
47955
pRunningGainDelta[iChannel] = t * d;
47951
47956
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -47954,7 +47959,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
47954
47959
iFrame = 0;
47955
47960
47956
47961
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
47957
- if (pGainer->config. channels == 2) {
47962
+ if (channels == 2) {
47958
47963
#if defined(MA_SUPPORT_SSE2)
47959
47964
if (ma_has_sse2()) {
47960
47965
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48002,6 +48007,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48002
48007
48003
48008
iFrame = unrolledLoopCount << 1;
48004
48009
#else
48010
+ #pragma clang loop vectorize(enable)
48005
48011
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48006
48012
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48007
48013
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48013,7 +48019,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48013
48019
}
48014
48020
#endif
48015
48021
}
48016
- } else if (pGainer->config. channels == 6) {
48022
+ } else if (channels == 6) {
48017
48023
#if defined(MA_SUPPORT_SSE2)
48018
48024
if (ma_has_sse2()) {
48019
48025
/*
@@ -48046,6 +48052,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48046
48052
} else
48047
48053
#endif
48048
48054
{
48055
+ #pragma clang loop vectorize(enable)
48049
48056
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48050
48057
for (iChannel = 0; iChannel < 6; iChannel += 1) {
48051
48058
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48057,7 +48064,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48057
48064
}
48058
48065
}
48059
48066
}
48060
- } else if (pGainer->config. channels == 8) {
48067
+ } else if (channels == 8) {
48061
48068
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48062
48069
#if defined(MA_SUPPORT_SSE2)
48063
48070
if (ma_has_sse2()) {
@@ -48077,6 +48084,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48077
48084
#endif
48078
48085
{
48079
48086
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48087
+ #pragma clang loop vectorize(enable)
48080
48088
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48081
48089
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48082
48090
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48090,17 +48098,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48090
48098
}
48091
48099
}
48092
48100
48101
+ #pragma clang loop unroll(disable)
48093
48102
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48094
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48095
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48103
+ #pragma clang loop vectorize(enable)
48104
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48105
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
48096
48106
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48097
48107
}
48098
48108
}
48099
48109
} else {
48100
48110
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48111
+ #pragma clang loop unroll(disable)
48101
48112
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48102
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48103
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48113
+ #pragma clang loop vectorize(enable)
48114
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48115
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48104
48116
}
48105
48117
48106
48118
a += d;
@@ -48119,18 +48131,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48119
48131
48120
48132
/* All we need to do here is apply the new gains using an optimized path. */
48121
48133
if (pFramesOut != NULL && pFramesIn != NULL) {
48122
- if (pGainer->config. channels <= 32) {
48134
+ if (channels <= 32) {
48123
48135
float gains[32];
48124
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48136
+ #pragma clang loop unroll(disable)
48137
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48125
48138
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48126
48139
}
48127
48140
48128
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
48141
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
48129
48142
} else {
48130
48143
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48144
+ #pragma clang loop unroll(disable)
48131
48145
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48132
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48133
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48146
+ #pragma clang loop vectorize(enable)
48147
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48148
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48134
48149
}
48135
48150
}
48136
48151
}
@@ -50491,7 +50506,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
50491
50506
50492
50507
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
50493
50508
50494
- MA_ASSUME(channels > 0 );
50509
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
50495
50510
for (c = 0; c < channels; c += 1) {
50496
50511
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
50497
50512
pFrameOut[c] = s;
@@ -50510,7 +50525,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
50510
50525
50511
50526
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
50512
50527
50513
- MA_ASSUME(channels > 0 );
50528
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
50514
50529
for (c = 0; c < channels; c += 1) {
50515
50530
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
50516
50531
pFrameOut[c] = s;
@@ -51745,6 +51760,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
51745
51760
ma_uint64 iFrame;
51746
51761
ma_uint32 iChannelOut;
51747
51762
51763
+ #pragma clang loop unroll(disable)
51748
51764
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
51749
51765
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
51750
51766
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51765,6 +51781,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
51765
51781
ma_uint64 iFrame;
51766
51782
ma_uint32 iChannelOut;
51767
51783
51784
+ #pragma clang loop unroll(disable)
51768
51785
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
51769
51786
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
51770
51787
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51807,6 +51824,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
51807
51824
ma_uint64 iFrame;
51808
51825
ma_uint32 iChannelOut;
51809
51826
51827
+ #pragma clang loop unroll(disable)
51810
51828
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
51811
51829
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
51812
51830
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51827,6 +51845,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
51827
51845
ma_uint64 iFrame;
51828
51846
ma_uint32 iChannelOut;
51829
51847
51848
+ #pragma clang loop unroll(disable)
51830
51849
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
51831
51850
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
51832
51851
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52061,6 +52080,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52061
52080
} else
52062
52081
#endif
52063
52082
{
52083
+ #pragma clang loop vectorize(enable)
52064
52084
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52065
52085
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
52066
52086
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52088,6 +52108,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52088
52108
} else
52089
52109
#endif
52090
52110
{
52111
+ #pragma clang loop vectorize(enable)
52091
52112
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52092
52113
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
52093
52114
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52105,6 +52126,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52105
52126
} else
52106
52127
#endif
52107
52128
{
52129
+ #pragma clang loop vectorize(enable)
52108
52130
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52109
52131
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
52110
52132
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -65254,7 +65276,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
65254
65276
ma_uint64 iFrame;
65255
65277
ma_uint32 iChannel;
65256
65278
const ma_uint32 channels = pNoise->config.channels;
65257
- MA_ASSUME(channels > 0 );
65279
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
65258
65280
65259
65281
if (pNoise->config.format == ma_format_f32) {
65260
65282
float* pFramesOutF32 = (float*)pFramesOut;
@@ -65373,7 +65395,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
65373
65395
ma_uint64 iFrame;
65374
65396
ma_uint32 iChannel;
65375
65397
const ma_uint32 channels = pNoise->config.channels;
65376
- MA_ASSUME(channels > 0 );
65398
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
65377
65399
65378
65400
if (pNoise->config.format == ma_format_f32) {
65379
65401
float* pFramesOutF32 = (float*)pFramesOut;
@@ -65455,7 +65477,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
65455
65477
ma_uint64 iFrame;
65456
65478
ma_uint32 iChannel;
65457
65479
const ma_uint32 channels = pNoise->config.channels;
65458
- MA_ASSUME(channels > 0 );
65480
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
65459
65481
65460
65482
if (pNoise->config.format == ma_format_f32) {
65461
65483
float* pFramesOutF32 = (float*)pFramesOut;
@@ -69634,7 +69656,7 @@ MA_API void ma_debug_fill_pcm_frames_with_sine_wave(float* pFramesOut, ma_uint32
69634
69656
69635
69657
69636
69658
69637
- static ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64 frameCount, ma_uint32 channels, float volume)
69659
+ static ma_result ma_mix_pcm_frames_f32(float* MA_RESTRICT pDst, const float* MA_RESTRICT pSrc, ma_uint64 frameCount, ma_uint32 channels, float volume)
69638
69660
{
69639
69661
ma_uint64 iSample;
69640
69662
ma_uint64 sampleCount;
@@ -69650,10 +69672,12 @@ static ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
69650
69672
sampleCount = frameCount * channels;
69651
69673
69652
69674
if (volume == 1) {
69675
+ #pragma clang loop vectorize(enable)
69653
69676
for (iSample = 0; iSample < sampleCount; iSample += 1) {
69654
69677
pDst[iSample] += pSrc[iSample];
69655
69678
}
69656
69679
} else {
69680
+ #pragma clang loop vectorize(enable)
69657
69681
for (iSample = 0; iSample < sampleCount; iSample += 1) {
69658
69682
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
69659
69683
}
0 commit comments