| /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved. |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| /* Copyright (C) 2011 Google Inc. All rights reserved. |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE.WEBKIT file. |
| */ |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "drc_math.h" |
| #include "drc_kernel.h" |
| |
| #define MAX_PRE_DELAY_FRAMES 1024 |
| #define MAX_PRE_DELAY_FRAMES_MASK (MAX_PRE_DELAY_FRAMES - 1) |
| #define DEFAULT_PRE_DELAY_FRAMES 256 |
| #define DIVISION_FRAMES 32 |
| #define DIVISION_FRAMES_MASK (DIVISION_FRAMES - 1) |
| |
| #define assert_on_compile(e) ((void)sizeof(char[1 - 2 * !(e)])) |
| #define assert_on_compile_is_power_of_2(n) \ |
| assert_on_compile((n) != 0 && (((n) & ((n)-1)) == 0)) |
| |
| const float uninitialized_value = -1; |
| static int drc_math_initialized; |
| |
| void dk_init(struct drc_kernel *dk, float sample_rate) |
| { |
| int i; |
| |
| if (!drc_math_initialized) { |
| drc_math_initialized = 1; |
| drc_math_init(); |
| } |
| |
| dk->sample_rate = sample_rate; |
| dk->detector_average = 0; |
| dk->compressor_gain = 1; |
| dk->enabled = 0; |
| dk->processed = 0; |
| dk->last_pre_delay_frames = DEFAULT_PRE_DELAY_FRAMES; |
| dk->pre_delay_read_index = 0; |
| dk->pre_delay_write_index = DEFAULT_PRE_DELAY_FRAMES; |
| dk->max_attack_compression_diff_db = -INFINITY; |
| dk->ratio = uninitialized_value; |
| dk->slope = uninitialized_value; |
| dk->linear_threshold = uninitialized_value; |
| dk->db_threshold = uninitialized_value; |
| dk->db_knee = uninitialized_value; |
| dk->knee_threshold = uninitialized_value; |
| dk->ratio_base = uninitialized_value; |
| dk->K = uninitialized_value; |
| |
| assert_on_compile_is_power_of_2(DIVISION_FRAMES); |
| assert_on_compile(DIVISION_FRAMES % 4 == 0); |
| /* Allocate predelay buffers */ |
| assert_on_compile_is_power_of_2(MAX_PRE_DELAY_FRAMES); |
| for (i = 0; i < DRC_NUM_CHANNELS; i++) { |
| size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES; |
| dk->pre_delay_buffers[i] = (float *)calloc(1, size); |
| } |
| } |
| |
| void dk_free(struct drc_kernel *dk) |
| { |
| int i; |
| for (i = 0; i < DRC_NUM_CHANNELS; ++i) |
| free(dk->pre_delay_buffers[i]); |
| } |
| |
| /* Sets the pre-delay (lookahead) buffer size */ |
| static void set_pre_delay_time(struct drc_kernel *dk, float pre_delay_time) |
| { |
| int i; |
| /* Re-configure look-ahead section pre-delay if delay time has |
| * changed. */ |
| unsigned pre_delay_frames = pre_delay_time * dk->sample_rate; |
| pre_delay_frames = min(pre_delay_frames, MAX_PRE_DELAY_FRAMES - 1); |
| |
| /* Make pre_delay_frames multiplies of DIVISION_FRAMES. This way we |
| * won't split a division of samples into two blocks of memory, so it is |
| * easier to process. This may make the actual delay time slightly less |
| * than the specified value, but the difference is less than 1ms. */ |
| pre_delay_frames &= ~DIVISION_FRAMES_MASK; |
| |
| /* We need at least one division buffer, so the incoming data won't |
| * overwrite the output data */ |
| pre_delay_frames = max(pre_delay_frames, DIVISION_FRAMES); |
| |
| if (dk->last_pre_delay_frames != pre_delay_frames) { |
| dk->last_pre_delay_frames = pre_delay_frames; |
| for (i = 0; i < DRC_NUM_CHANNELS; ++i) { |
| size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES; |
| memset(dk->pre_delay_buffers[i], 0, size); |
| } |
| |
| dk->pre_delay_read_index = 0; |
| dk->pre_delay_write_index = pre_delay_frames; |
| } |
| } |
| |
| /* Exponential curve for the knee. It is 1st derivative matched at |
| * dk->linear_threshold and asymptotically approaches the value |
| * dk->linear_threshold + 1 / k. |
| * |
| * This is used only when calculating the static curve, not used when actually |
| * compress the input data (knee_curveK below is used instead). |
| */ |
| static float knee_curve(struct drc_kernel *dk, float x, float k) |
| { |
| /* Linear up to threshold. */ |
| if (x < dk->linear_threshold) |
| return x; |
| |
| return dk->linear_threshold + |
| (1 - knee_expf(-k * (x - dk->linear_threshold))) / k; |
| } |
| |
| /* Approximate 1st derivative with input and output expressed in dB. This slope |
| * is equal to the inverse of the compression "ratio". In other words, a |
| * compression ratio of 20 would be a slope of 1/20. |
| */ |
| static float slope_at(struct drc_kernel *dk, float x, float k) |
| { |
| if (x < dk->linear_threshold) |
| return 1; |
| |
| float x2 = x * 1.001; |
| |
| float x_db = linear_to_decibels(x); |
| float x2Db = linear_to_decibels(x2); |
| |
| float y_db = linear_to_decibels(knee_curve(dk, x, k)); |
| float y2Db = linear_to_decibels(knee_curve(dk, x2, k)); |
| |
| float m = (y2Db - y_db) / (x2Db - x_db); |
| |
| return m; |
| } |
| |
| static float k_at_slope(struct drc_kernel *dk, float desired_slope) |
| { |
| float x_db = dk->db_threshold + dk->db_knee; |
| float x = decibels_to_linear(x_db); |
| |
| /* Approximate k given initial values. */ |
| float minK = 0.1; |
| float maxK = 10000; |
| float k = 5; |
| int i; |
| |
| for (i = 0; i < 15; ++i) { |
| /* A high value for k will more quickly asymptotically approach |
| * a slope of 0. */ |
| float slope = slope_at(dk, x, k); |
| |
| if (slope < desired_slope) { |
| /* k is too high. */ |
| maxK = k; |
| } else { |
| /* k is too low. */ |
| minK = k; |
| } |
| |
| /* Re-calculate based on geometric mean. */ |
| k = sqrtf(minK * maxK); |
| } |
| |
| return k; |
| } |
| |
| static void update_static_curve_parameters(struct drc_kernel *dk, |
| float db_threshold, float db_knee, |
| float ratio) |
| { |
| if (db_threshold != dk->db_threshold || db_knee != dk->db_knee || |
| ratio != dk->ratio) { |
| /* Threshold and knee. */ |
| dk->db_threshold = db_threshold; |
| dk->linear_threshold = decibels_to_linear(db_threshold); |
| dk->db_knee = db_knee; |
| |
| /* Compute knee parameters. */ |
| dk->ratio = ratio; |
| dk->slope = 1 / dk->ratio; |
| |
| float k = k_at_slope(dk, 1 / dk->ratio); |
| dk->K = k; |
| /* See knee_curveK() for details */ |
| dk->knee_alpha = dk->linear_threshold + 1 / k; |
| dk->knee_beta = -expf(k * dk->linear_threshold) / k; |
| |
| dk->knee_threshold = decibels_to_linear(db_threshold + db_knee); |
| /* See volume_gain() for details */ |
| float y0 = knee_curve(dk, dk->knee_threshold, k); |
| dk->ratio_base = y0 * powf(dk->knee_threshold, -dk->slope); |
| } |
| } |
| |
| /* This is the knee part of the compression curve. Returns the output level |
| * given the input level x. */ |
| static float knee_curveK(struct drc_kernel *dk, float x) |
| { |
| /* The formula in knee_curveK is dk->linear_threshold + |
| * (1 - expf(-k * (x - dk->linear_threshold))) / k |
| * which simplifies to (alpha + beta * expf(gamma)) |
| * where alpha = dk->linear_threshold + 1 / k |
| * beta = -expf(k * dk->linear_threshold) / k |
| * gamma = -k * x |
| */ |
| return dk->knee_alpha + dk->knee_beta * knee_expf(-dk->K * x); |
| } |
| |
| /* Full compression curve with constant ratio after knee. Returns the ratio of |
| * output and input signal. */ |
| static float volume_gain(struct drc_kernel *dk, float x) |
| { |
| float y; |
| |
| if (x < dk->knee_threshold) { |
| if (x < dk->linear_threshold) |
| return 1; |
| y = knee_curveK(dk, x) / x; |
| } else { |
| /* Constant ratio after knee. |
| * log(y/y0) = s * log(x/x0) |
| * => y = y0 * (x/x0)^s |
| * => y = [y0 * (1/x0)^s] * x^s |
| * => y = dk->ratio_base * x^s |
| * => y/x = dk->ratio_base * x^(s - 1) |
| * => y/x = dk->ratio_base * e^(log(x) * (s - 1)) |
| */ |
| y = dk->ratio_base * knee_expf(logf(x) * (dk->slope - 1)); |
| } |
| |
| return y; |
| } |
| |
| void dk_set_parameters(struct drc_kernel *dk, float db_threshold, float db_knee, |
| float ratio, float attack_time, float release_time, |
| float pre_delay_time, float db_post_gain, |
| float releaseZone1, float releaseZone2, |
| float releaseZone3, float releaseZone4) |
| { |
| float sample_rate = dk->sample_rate; |
| |
| update_static_curve_parameters(dk, db_threshold, db_knee, ratio); |
| |
| /* Makeup gain. */ |
| float full_range_gain = volume_gain(dk, 1); |
| float full_range_makeup_gain = 1 / full_range_gain; |
| |
| /* Empirical/perceptual tuning. */ |
| full_range_makeup_gain = powf(full_range_makeup_gain, 0.6f); |
| |
| dk->main_linear_gain = |
| decibels_to_linear(db_post_gain) * full_range_makeup_gain; |
| |
| /* Attack parameters. */ |
| attack_time = max(0.001f, attack_time); |
| dk->attack_frames = attack_time * sample_rate; |
| |
| /* Release parameters. */ |
| float release_frames = sample_rate * release_time; |
| |
| /* Detector release time. */ |
| float sat_release_time = 0.0025f; |
| float sat_release_frames = sat_release_time * sample_rate; |
| dk->sat_release_frames_inv_neg = -1 / sat_release_frames; |
| dk->sat_release_rate_at_neg_two_db = |
| decibels_to_linear(-2 * dk->sat_release_frames_inv_neg) - 1; |
| |
| /* Create a smooth function which passes through four points. |
| * Polynomial of the form y = a + b*x + c*x^2 + d*x^3 + e*x^4 |
| */ |
| float y1 = release_frames * releaseZone1; |
| float y2 = release_frames * releaseZone2; |
| float y3 = release_frames * releaseZone3; |
| float y4 = release_frames * releaseZone4; |
| |
| /* All of these coefficients were derived for 4th order polynomial curve |
| * fitting where the y values match the evenly spaced x values as |
| * follows: (y1 : x == 0, y2 : x == 1, y3 : x == 2, y4 : x == 3) |
| */ |
| dk->kA = 0.9999999999999998f * y1 + 1.8432219684323923e-16f * y2 - |
| 1.9373394351676423e-16f * y3 + 8.824516011816245e-18f * y4; |
| dk->kB = -1.5788320352845888f * y1 + 2.3305837032074286f * y2 - |
| 0.9141194204840429f * y3 + 0.1623677525612032f * y4; |
| dk->kC = 0.5334142869106424f * y1 - 1.272736789213631f * y2 + |
| 0.9258856042207512f * y3 - 0.18656310191776226f * y4; |
| dk->kD = 0.08783463138207234f * y1 - 0.1694162967925622f * y2 + |
| 0.08588057951595272f * y3 - 0.00429891410546283f * y4; |
| dk->kE = -0.042416883008123074f * y1 + 0.1115693827987602f * y2 - |
| 0.09764676325265872f * y3 + 0.028494263462021576f * y4; |
| |
| /* x ranges from 0 -> 3 0 1 2 3 |
| * -15 -10 -5 0db |
| * |
| * y calculates adaptive release frames depending on the amount of |
| * compression. |
| */ |
| set_pre_delay_time(dk, pre_delay_time); |
| } |
| |
| void dk_set_enabled(struct drc_kernel *dk, int enabled) |
| { |
| dk->enabled = enabled; |
| } |
| |
| /* Updates the envelope_rate used for the next division */ |
| static void dk_update_envelope(struct drc_kernel *dk) |
| { |
| const float kA = dk->kA; |
| const float kB = dk->kB; |
| const float kC = dk->kC; |
| const float kD = dk->kD; |
| const float kE = dk->kE; |
| const float attack_frames = dk->attack_frames; |
| |
| /* Calculate desired gain */ |
| float desired_gain = dk->detector_average; |
| |
| /* Pre-warp so we get desired_gain after sin() warp below. */ |
| float scaled_desired_gain = warp_asinf(desired_gain); |
| |
| /* Deal with envelopes */ |
| |
| /* envelope_rate is the rate we slew from current compressor level to |
| * the desired level. The exact rate depends on if we're attacking or |
| * releasing and by how much. |
| */ |
| float envelope_rate; |
| |
| int is_releasing = scaled_desired_gain > dk->compressor_gain; |
| |
| /* compression_diff_db is the difference between current compression |
| * level and the desired level. */ |
| float compression_diff_db = |
| linear_to_decibels(dk->compressor_gain / scaled_desired_gain); |
| |
| if (is_releasing) { |
| /* Release mode - compression_diff_db should be negative dB */ |
| dk->max_attack_compression_diff_db = -INFINITY; |
| |
| /* Fix gremlins. */ |
| if (isbadf(compression_diff_db)) |
| compression_diff_db = -1; |
| |
| /* Adaptive release - higher compression (lower |
| * compression_diff_db) releases faster. Contain within range: |
| * -12 -> 0 then scale to go from 0 -> 3 |
| */ |
| float x = compression_diff_db; |
| x = max(-12.0f, x); |
| x = min(0.0f, x); |
| x = 0.25f * (x + 12); |
| |
| /* Compute adaptive release curve using 4th order polynomial. |
| * Normal values for the polynomial coefficients would create a |
| * monotonically increasing function. |
| */ |
| float x2 = x * x; |
| float x3 = x2 * x; |
| float x4 = x2 * x2; |
| float release_frames = |
| kA + kB * x + kC * x2 + kD * x3 + kE * x4; |
| |
| #define kSpacingDb 5 |
| float db_per_frame = kSpacingDb / release_frames; |
| envelope_rate = decibels_to_linear(db_per_frame); |
| } else { |
| /* Attack mode - compression_diff_db should be positive dB */ |
| |
| /* Fix gremlins. */ |
| if (isbadf(compression_diff_db)) |
| compression_diff_db = 1; |
| |
| /* As long as we're still in attack mode, use a rate based off |
| * the largest compression_diff_db we've encountered so far. |
| */ |
| dk->max_attack_compression_diff_db = |
| max(dk->max_attack_compression_diff_db, |
| compression_diff_db); |
| |
| float eff_atten_diff_db = |
| max(0.5f, dk->max_attack_compression_diff_db); |
| |
| float x = 0.25f / eff_atten_diff_db; |
| envelope_rate = 1 - powf(x, 1 / attack_frames); |
| } |
| |
| dk->envelope_rate = envelope_rate; |
| dk->scaled_desired_gain = scaled_desired_gain; |
| } |
| |
| /* For a division of frames, take the absolute values of left channel and right |
| * channel, store the maximum of them in output. */ |
| #if defined(__aarch64__) |
| static inline void max_abs_division(float *output, const float *data0, |
| const float *data1) |
| { |
| int count = DIVISION_FRAMES / 4; |
| |
| // clang-format off |
| __asm__ __volatile__( |
| "1: \n" |
| "ld1 {v0.4s}, [%[data0]], #16 \n" |
| "ld1 {v1.4s}, [%[data1]], #16 \n" |
| "fabs v0.4s, v0.4s \n" |
| "fabs v1.4s, v1.4s \n" |
| "fmax v0.4s, v0.4s, v1.4s \n" |
| "st1 {v0.4s}, [%[output]], #16 \n" |
| "subs %w[count], %w[count], #1 \n" |
| "b.ne 1b \n" |
| : /* output */ |
| [data0]"+r"(data0), |
| [data1]"+r"(data1), |
| [output]"+r"(output), |
| [count]"+r"(count) |
| : /* input */ |
| : /* clobber */ |
| "v0", "v1", "memory", "cc"); |
| // clang-format on |
| } |
| #elif defined(__ARM_NEON__) |
| static inline void max_abs_division(float *output, const float *data0, |
| const float *data1) |
| { |
| int count = DIVISION_FRAMES / 4; |
| |
| // clang-format off |
| __asm__ __volatile__( |
| "1: \n" |
| "vld1.32 {q0}, [%[data0]]! \n" |
| "vld1.32 {q1}, [%[data1]]! \n" |
| "vabs.f32 q0, q0 \n" |
| "vabs.f32 q1, q1 \n" |
| "vmax.f32 q0, q1 \n" |
| "vst1.32 {q0}, [%[output]]! \n" |
| "subs %[count], #1 \n" |
| "bne 1b \n" |
| : /* output */ |
| [data0]"+r"(data0), |
| [data1]"+r"(data1), |
| [output]"+r"(output), |
| [count]"+r"(count) |
| : /* input */ |
| : /* clobber */ |
| "q0", "q1", "memory", "cc"); |
| // clang-format on |
| } |
| #elif defined(__SSE3__) |
| #include <emmintrin.h> |
| static inline void max_abs_division(float *output, const float *data0, |
| const float *data1) |
| { |
| __m128 x, y; |
| int count = DIVISION_FRAMES / 4; |
| // clang-format off |
| __asm__ __volatile__( |
| "1: \n" |
| "lddqu (%[data0]), %[x] \n" |
| "lddqu (%[data1]), %[y] \n" |
| "andps %[mask], %[x] \n" |
| "andps %[mask], %[y] \n" |
| "maxps %[y], %[x] \n" |
| "movdqu %[x], (%[output]) \n" |
| "add $16, %[data0] \n" |
| "add $16, %[data1] \n" |
| "add $16, %[output] \n" |
| "sub $1, %[count] \n" |
| "jnz 1b \n" |
| : /* output */ |
| [data0]"+r"(data0), |
| [data1]"+r"(data1), |
| [output]"+r"(output), |
| [count]"+r"(count), |
| [x]"=&x"(x), |
| [y]"=&x"(y) |
| : /* input */ |
| [mask]"x"(_mm_set1_epi32(0x7fffffff)) |
| : /* clobber */ |
| "memory", "cc"); |
| // clang-format on |
| } |
| #else |
| static inline void max_abs_division(float *output, const float *data0, |
| const float *data1) |
| { |
| int i; |
| for (i = 0; i < DIVISION_FRAMES; i++) |
| output[i] = fmaxf(fabsf(data0[i]), fabsf(data1[i])); |
| } |
| #endif |
| |
| /* Update detector_average from the last input division. */ |
| static void dk_update_detector_average(struct drc_kernel *dk) |
| { |
| float abs_input_array[DIVISION_FRAMES]; |
| const float sat_release_frames_inv_neg = dk->sat_release_frames_inv_neg; |
| const float sat_release_rate_at_neg_two_db = |
| dk->sat_release_rate_at_neg_two_db; |
| float detector_average = dk->detector_average; |
| int div_start, i; |
| |
| /* Calculate the start index of the last input division */ |
| if (dk->pre_delay_write_index == 0) { |
| div_start = MAX_PRE_DELAY_FRAMES - DIVISION_FRAMES; |
| } else { |
| div_start = dk->pre_delay_write_index - DIVISION_FRAMES; |
| } |
| |
| /* The max abs value across all channels for this frame */ |
| max_abs_division(abs_input_array, &dk->pre_delay_buffers[0][div_start], |
| &dk->pre_delay_buffers[1][div_start]); |
| |
| for (i = 0; i < DIVISION_FRAMES; i++) { |
| /* Compute compression amount from un-delayed signal */ |
| float abs_input = abs_input_array[i]; |
| |
| /* Calculate shaped power on undelayed input. Put through |
| * shaping curve. This is linear up to the threshold, then |
| * enters a "knee" portion followed by the "ratio" portion. The |
| * transition from the threshold to the knee is smooth (1st |
| * derivative matched). The transition from the knee to the |
| * ratio portion is smooth (1st derivative matched). |
| */ |
| float gain = volume_gain(dk, abs_input); |
| int is_release = (gain > detector_average); |
| if (is_release) { |
| if (gain > NEG_TWO_DB) { |
| detector_average += |
| (gain - detector_average) * |
| sat_release_rate_at_neg_two_db; |
| } else { |
| float gain_db = linear_to_decibels(gain); |
| float db_per_frame = |
| gain_db * sat_release_frames_inv_neg; |
| float sat_release_rate = |
| decibels_to_linear(db_per_frame) - 1; |
| detector_average += (gain - detector_average) * |
| sat_release_rate; |
| } |
| } else { |
| detector_average = gain; |
| } |
| |
| /* Fix gremlins. */ |
| if (isbadf(detector_average)) |
| detector_average = 1.0f; |
| else |
| detector_average = min(detector_average, 1.0f); |
| } |
| |
| dk->detector_average = detector_average; |
| } |
| |
| /* Calculate compress_gain from the envelope and apply total_gain to compress |
| * the next output division. */ |
| /* TODO(fbarchard): Port to aarch64 */ |
| #if defined(__ARM_NEON__) |
| #include <arm_neon.h> |
| static void dk_compress_output(struct drc_kernel *dk) |
| { |
| const float main_linear_gain = dk->main_linear_gain; |
| const float envelope_rate = dk->envelope_rate; |
| const float scaled_desired_gain = dk->scaled_desired_gain; |
| const float compressor_gain = dk->compressor_gain; |
| const int div_start = dk->pre_delay_read_index; |
| float *ptr_left = &dk->pre_delay_buffers[0][div_start]; |
| float *ptr_right = &dk->pre_delay_buffers[1][div_start]; |
| int count = DIVISION_FRAMES / 4; |
| |
| /* See warp_sinf() for the details for the constants. */ |
| const float32x4_t A7 = vdupq_n_f32(-4.3330336920917034149169921875e-3f); |
| const float32x4_t A5 = vdupq_n_f32(7.9434238374233245849609375e-2f); |
| const float32x4_t A3 = vdupq_n_f32(-0.645892798900604248046875f); |
| const float32x4_t A1 = vdupq_n_f32(1.5707910060882568359375f); |
| |
| /* Exponential approach to desired gain. */ |
| if (envelope_rate < 1) { |
| float c = compressor_gain - scaled_desired_gain; |
| float r = 1 - envelope_rate; |
| float32x4_t x0 = { c * r, c * r * r, c * r * r * r, |
| c * r * r * r * r }; |
| float32x4_t x, x2, x4, left, right, tmp1, tmp2; |
| |
| // clang-format off |
| __asm__ __volatile( |
| "b 2f \n" |
| "1: \n" |
| "vmul.f32 %q[x0], %q[r4] \n" |
| "2: \n" |
| "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n" |
| "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n" |
| "vadd.f32 %q[x], %q[x0], %q[base] \n" |
| /* Calculate warp_sin() for four values in x. */ |
| "vmul.f32 %q[x2], %q[x], %q[x] \n" |
| "vmov.f32 %q[tmp1], %q[A5] \n" |
| "vmov.f32 %q[tmp2], %q[A1] \n" |
| "vmul.f32 %q[x4], %q[x2], %q[x2] \n" |
| "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n" |
| "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n" |
| "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n" |
| "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n" |
| /* Now tmp2 contains the result of warp_sin(). */ |
| "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n" |
| "vmul.f32 %q[left], %q[tmp2] \n" |
| "vmul.f32 %q[right], %q[tmp2] \n" |
| "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n" |
| "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n" |
| "subs %[count], #1 \n" |
| "bne 1b \n" |
| : /* output */ |
| "=r"(count), |
| "=r"(ptr_left), |
| "=r"(ptr_right), |
| "=w"(x0), |
| [x]"=&w"(x), |
| [x2]"=&w"(x2), |
| [x4]"=&w"(x4), |
| [left]"=&w"(left), |
| [right]"=&w"(right), |
| [tmp1]"=&w"(tmp1), |
| [tmp2]"=&w"(tmp2) |
| : /* input */ |
| [count]"0"(count), |
| [ptr_left]"1"(ptr_left), |
| [ptr_right]"2"(ptr_right), |
| [x0]"3"(x0), |
| [A1]"w"(A1), |
| [A3]"w"(A3), |
| [A5]"w"(A5), |
| [A7]"w"(A7), |
| [base]"w"(vdupq_n_f32(scaled_desired_gain)), |
| [r4]"w"(vdupq_n_f32(r*r*r*r)), |
| [g]"w"(vdupq_n_f32(main_linear_gain)) |
| : /* clobber */ |
| "memory", "cc"); |
| // clang-format on |
| dk->compressor_gain = x[3]; |
| } else { |
| float c = compressor_gain; |
| float r = envelope_rate; |
| float32x4_t x = { c * r, c * r * r, c * r * r * r, |
| c * r * r * r * r }; |
| float32x4_t x2, x4, left, right, tmp1, tmp2; |
| |
| // clang-format off |
| __asm__ __volatile( |
| "b 2f \n" |
| "1: \n" |
| "vmul.f32 %q[x], %q[r4] \n" |
| "2: \n" |
| "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n" |
| "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n" |
| "vmin.f32 %q[x], %q[one] \n" |
| /* Calculate warp_sin() for four values in x. */ |
| "vmul.f32 %q[x2], %q[x], %q[x] \n" |
| "vmov.f32 %q[tmp1], %q[A5] \n" |
| "vmov.f32 %q[tmp2], %q[A1] \n" |
| "vmul.f32 %q[x4], %q[x2], %q[x2] \n" |
| "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n" |
| "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n" |
| "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n" |
| "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n" |
| /* Now tmp2 contains the result of warp_sin(). */ |
| "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n" |
| "vmul.f32 %q[left], %q[tmp2] \n" |
| "vmul.f32 %q[right], %q[tmp2] \n" |
| "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n" |
| "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n" |
| "subs %[count], #1 \n" |
| "bne 1b \n" |
| : /* output */ |
| "=r"(count), |
| "=r"(ptr_left), |
| "=r"(ptr_right), |
| "=w"(x), |
| [x2]"=&w"(x2), |
| [x4]"=&w"(x4), |
| [left]"=&w"(left), |
| [right]"=&w"(right), |
| [tmp1]"=&w"(tmp1), |
| [tmp2]"=&w"(tmp2) |
| : /* input */ |
| [count]"0"(count), |
| [ptr_left]"1"(ptr_left), |
| [ptr_right]"2"(ptr_right), |
| [x]"3"(x), |
| [A1]"w"(A1), |
| [A3]"w"(A3), |
| [A5]"w"(A5), |
| [A7]"w"(A7), |
| [one]"w"(vdupq_n_f32(1)), |
| [r4]"w"(vdupq_n_f32(r*r*r*r)), |
| [g]"w"(vdupq_n_f32(main_linear_gain)) |
| : /* clobber */ |
| "memory", "cc"); |
| // clang-format on |
| dk->compressor_gain = x[3]; |
| } |
| } |
| #elif defined(__SSE3__) && defined(__x86_64__) |
| #include <emmintrin.h> |
| static void dk_compress_output(struct drc_kernel *dk) |
| { |
| const float main_linear_gain = dk->main_linear_gain; |
| const float envelope_rate = dk->envelope_rate; |
| const float scaled_desired_gain = dk->scaled_desired_gain; |
| const float compressor_gain = dk->compressor_gain; |
| const int div_start = dk->pre_delay_read_index; |
| float *ptr_left = &dk->pre_delay_buffers[0][div_start]; |
| float *ptr_right = &dk->pre_delay_buffers[1][div_start]; |
| int count = DIVISION_FRAMES / 4; |
| |
| /* See warp_sinf() for the details for the constants. */ |
| const __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f); |
| const __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f); |
| const __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f); |
| const __m128 A1 = _mm_set1_ps(1.5707910060882568359375f); |
| |
| /* Exponential approach to desired gain. */ |
| if (envelope_rate < 1) { |
| float c = compressor_gain - scaled_desired_gain; |
| float r = 1 - envelope_rate; |
| __m128 x0 = { c * r, c * r * r, c * r * r * r, |
| c * r * r * r * r }; |
| __m128 x, x2, x4, left, right, tmp1, tmp2; |
| |
| // clang-format off |
| __asm__ __volatile( |
| "jmp 2f \n" |
| "1: \n" |
| "mulps %[r4], %[x0] \n" |
| "2: \n" |
| "lddqu (%[ptr_left]), %[left] \n" |
| "lddqu (%[ptr_right]), %[right] \n" |
| "movaps %[x0], %[x] \n" |
| "addps %[base], %[x] \n" |
| /* Calculate warp_sin() for four values in x. */ |
| "movaps %[x], %[x2] \n" |
| "mulps %[x], %[x2] \n" |
| "movaps %[x2], %[x4] \n" |
| "movaps %[x2], %[tmp1] \n" |
| "movaps %[x2], %[tmp2] \n" |
| "mulps %[x2], %[x4] \n" |
| "mulps %[A7], %[tmp1] \n" |
| "mulps %[A3], %[tmp2] \n" |
| "addps %[A5], %[tmp1] \n" |
| "addps %[A1], %[tmp2] \n" |
| "mulps %[x4], %[tmp1] \n" |
| "addps %[tmp1], %[tmp2] \n" |
| "mulps %[x], %[tmp2] \n" |
| /* Now tmp2 contains the result of warp_sin(). */ |
| "mulps %[g], %[tmp2] \n" |
| "mulps %[tmp2], %[left] \n" |
| "mulps %[tmp2], %[right] \n" |
| "movdqu %[left], (%[ptr_left]) \n" |
| "movdqu %[right], (%[ptr_right]) \n" |
| "add $16, %[ptr_left] \n" |
| "add $16, %[ptr_right] \n" |
| "sub $1, %[count] \n" |
| "jne 1b \n" |
| : /* output */ |
| "=r"(count), |
| "=r"(ptr_left), |
| "=r"(ptr_right), |
| "=x"(x0), |
| [x]"=&x"(x), |
| [x2]"=&x"(x2), |
| [x4]"=&x"(x4), |
| [left]"=&x"(left), |
| [right]"=&x"(right), |
| [tmp1]"=&x"(tmp1), |
| [tmp2]"=&x"(tmp2) |
| : /* input */ |
| [count]"0"(count), |
| [ptr_left]"1"(ptr_left), |
| [ptr_right]"2"(ptr_right), |
| [x0]"3"(x0), |
| [A1]"x"(A1), |
| [A3]"x"(A3), |
| [A5]"x"(A5), |
| [A7]"x"(A7), |
| [base]"x"(_mm_set1_ps(scaled_desired_gain)), |
| [r4]"x"(_mm_set1_ps(r*r*r*r)), |
| [g]"x"(_mm_set1_ps(main_linear_gain)) |
| : /* clobber */ |
| "memory", "cc"); |
| // clang-format on |
| dk->compressor_gain = x[3]; |
| } else { |
| /* See warp_sinf() for the details for the constants. */ |
| __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f); |
| __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f); |
| __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f); |
| __m128 A1 = _mm_set1_ps(1.5707910060882568359375f); |
| |
| float c = compressor_gain; |
| float r = envelope_rate; |
| __m128 x = { c * r, c * r * r, c * r * r * r, |
| c * r * r * r * r }; |
| __m128 x2, x4, left, right, tmp1, tmp2; |
| |
| // clang-format off |
| __asm__ __volatile( |
| "jmp 2f \n" |
| "1: \n" |
| "mulps %[r4], %[x] \n" |
| "2: \n" |
| "lddqu (%[ptr_left]), %[left] \n" |
| "lddqu (%[ptr_right]), %[right] \n" |
| "minps %[one], %[x] \n" |
| /* Calculate warp_sin() for four values in x. */ |
| "movaps %[x], %[x2] \n" |
| "mulps %[x], %[x2] \n" |
| "movaps %[x2], %[x4] \n" |
| "movaps %[x2], %[tmp1] \n" |
| "movaps %[x2], %[tmp2] \n" |
| "mulps %[x2], %[x4] \n" |
| "mulps %[A7], %[tmp1] \n" |
| "mulps %[A3], %[tmp2] \n" |
| "addps %[A5], %[tmp1] \n" |
| "addps %[A1], %[tmp2] \n" |
| "mulps %[x4], %[tmp1] \n" |
| "addps %[tmp1], %[tmp2] \n" |
| "mulps %[x], %[tmp2] \n" |
| /* Now tmp2 contains the result of warp_sin(). */ |
| "mulps %[g], %[tmp2] \n" |
| "mulps %[tmp2], %[left] \n" |
| "mulps %[tmp2], %[right] \n" |
| "movdqu %[left], (%[ptr_left]) \n" |
| "movdqu %[right], (%[ptr_right]) \n" |
| "add $16, %[ptr_left] \n" |
| "add $16, %[ptr_right] \n" |
| "sub $1, %[count] \n" |
| "jne 1b \n" |
| : /* output */ |
| "=r"(count), |
| "=r"(ptr_left), |
| "=r"(ptr_right), |
| "=x"(x), |
| [x2]"=&x"(x2), |
| [x4]"=&x"(x4), |
| [left]"=&x"(left), |
| [right]"=&x"(right), |
| [tmp1]"=&x"(tmp1), |
| [tmp2]"=&x"(tmp2) |
| : /* input */ |
| [count]"0"(count), |
| [ptr_left]"1"(ptr_left), |
| [ptr_right]"2"(ptr_right), |
| [x]"3"(x), |
| [A1]"x"(A1), |
| [A3]"x"(A3), |
| [A5]"x"(A5), |
| [A7]"x"(A7), |
| [one]"x"(_mm_set1_ps(1)), |
| [r4]"x"(_mm_set1_ps(r*r*r*r)), |
| [g]"x"(_mm_set1_ps(main_linear_gain)) |
| : /* clobber */ |
| "memory", "cc"); |
| // clang-format on |
| dk->compressor_gain = x[3]; |
| } |
| } |
| #else |
| static void dk_compress_output(struct drc_kernel *dk) |
| { |
| const float main_linear_gain = dk->main_linear_gain; |
| const float envelope_rate = dk->envelope_rate; |
| const float scaled_desired_gain = dk->scaled_desired_gain; |
| const float compressor_gain = dk->compressor_gain; |
| const int div_start = dk->pre_delay_read_index; |
| float *ptr_left = &dk->pre_delay_buffers[0][div_start]; |
| float *ptr_right = &dk->pre_delay_buffers[1][div_start]; |
| int count = DIVISION_FRAMES / 4; |
| |
| int i, j; |
| |
| /* Exponential approach to desired gain. */ |
| if (envelope_rate < 1) { |
| /* Attack - reduce gain to desired. */ |
| float c = compressor_gain - scaled_desired_gain; |
| float base = scaled_desired_gain; |
| float r = 1 - envelope_rate; |
| float x[4] = { c * r, c * r * r, c * r * r * r, |
| c * r * r * r * r }; |
| float r4 = r * r * r * r; |
| |
| i = 0; |
| while (1) { |
| for (j = 0; j < 4; j++) { |
| /* Warp pre-compression gain to smooth out sharp |
| * exponential transition points. |
| */ |
| float post_warp_compressor_gain = |
| warp_sinf(x[j] + base); |
| |
| /* Calculate total gain using main gain. */ |
| float total_gain = main_linear_gain * |
| post_warp_compressor_gain; |
| |
| /* Apply final gain. */ |
| *ptr_left++ *= total_gain; |
| *ptr_right++ *= total_gain; |
| } |
| |
| if (++i == count) |
| break; |
| |
| for (j = 0; j < 4; j++) |
| x[j] = x[j] * r4; |
| } |
| |
| dk->compressor_gain = x[3] + base; |
| } else { |
| /* Release - exponentially increase gain to 1.0 */ |
| float c = compressor_gain; |
| float r = envelope_rate; |
| float x[4] = { c * r, c * r * r, c * r * r * r, |
| c * r * r * r * r }; |
| float r4 = r * r * r * r; |
| |
| i = 0; |
| while (1) { |
| for (j = 0; j < 4; j++) { |
| /* Warp pre-compression gain to smooth out sharp |
| * exponential transition points. |
| */ |
| float post_warp_compressor_gain = |
| warp_sinf(x[j]); |
| |
| /* Calculate total gain using main gain. */ |
| float total_gain = main_linear_gain * |
| post_warp_compressor_gain; |
| |
| /* Apply final gain. */ |
| *ptr_left++ *= total_gain; |
| *ptr_right++ *= total_gain; |
| } |
| |
| if (++i == count) |
| break; |
| |
| for (j = 0; j < 4; j++) |
| x[j] = min(1.0f, x[j] * r4); |
| } |
| |
| dk->compressor_gain = x[3]; |
| } |
| } |
| #endif |
| |
| /* After one complete divison of samples have been received (and one divison of |
| * samples have been output), we calculate shaped power average |
| * (detector_average) from the input division, update envelope parameters from |
| * detector_average, then prepare the next output division by applying the |
| * envelope to compress the samples. |
| */ |
| static void dk_process_one_division(struct drc_kernel *dk) |
| { |
| dk_update_detector_average(dk); |
| dk_update_envelope(dk); |
| dk_compress_output(dk); |
| } |
| |
| /* Copy the input data to the pre-delay buffer, and copy the output data back to |
| * the input buffer */ |
| static void dk_copy_fragment(struct drc_kernel *dk, float *data_channels[], |
| unsigned frame_index, int frames_to_process) |
| { |
| int write_index = dk->pre_delay_write_index; |
| int read_index = dk->pre_delay_read_index; |
| int j; |
| |
| for (j = 0; j < DRC_NUM_CHANNELS; ++j) { |
| memcpy(&dk->pre_delay_buffers[j][write_index], |
| &data_channels[j][frame_index], |
| frames_to_process * sizeof(float)); |
| memcpy(&data_channels[j][frame_index], |
| &dk->pre_delay_buffers[j][read_index], |
| frames_to_process * sizeof(float)); |
| } |
| |
| dk->pre_delay_write_index = |
| (write_index + frames_to_process) & MAX_PRE_DELAY_FRAMES_MASK; |
| dk->pre_delay_read_index = |
| (read_index + frames_to_process) & MAX_PRE_DELAY_FRAMES_MASK; |
| } |
| |
| /* Delay the input sample only and don't do other processing. This is used when |
| * the kernel is disabled. We want to do this to match the processing delay in |
| * kernels of other bands. |
| */ |
| static void dk_process_delay_only(struct drc_kernel *dk, float *data_channels[], |
| unsigned count) |
| { |
| int read_index = dk->pre_delay_read_index; |
| int write_index = dk->pre_delay_write_index; |
| int i = 0; |
| |
| while (i < count) { |
| int j; |
| int small = min(read_index, write_index); |
| int large = max(read_index, write_index); |
| /* chunk is the minimum of readable samples in contiguous |
| * buffer, writable samples in contiguous buffer, and the |
| * available input samples. */ |
| int chunk = min(large - small, MAX_PRE_DELAY_FRAMES - large); |
| chunk = min(chunk, count - i); |
| for (j = 0; j < DRC_NUM_CHANNELS; ++j) { |
| memcpy(&dk->pre_delay_buffers[j][write_index], |
| &data_channels[j][i], chunk * sizeof(float)); |
| memcpy(&data_channels[j][i], |
| &dk->pre_delay_buffers[j][read_index], |
| chunk * sizeof(float)); |
| } |
| read_index = (read_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK; |
| write_index = (write_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK; |
| i += chunk; |
| } |
| |
| dk->pre_delay_read_index = read_index; |
| dk->pre_delay_write_index = write_index; |
| } |
| |
| void dk_process(struct drc_kernel *dk, float *data_channels[], unsigned count) |
| { |
| int i = 0; |
| int fragment; |
| |
| if (!dk->enabled) { |
| dk_process_delay_only(dk, data_channels, count); |
| return; |
| } |
| |
| if (!dk->processed) { |
| dk_update_envelope(dk); |
| dk_compress_output(dk); |
| dk->processed = 1; |
| } |
| |
| int offset = dk->pre_delay_write_index & DIVISION_FRAMES_MASK; |
| while (i < count) { |
| fragment = min(DIVISION_FRAMES - offset, count - i); |
| dk_copy_fragment(dk, data_channels, i, fragment); |
| i += fragment; |
| offset = (offset + fragment) & DIVISION_FRAMES_MASK; |
| |
| /* Process the input division (32 frames). */ |
| if (offset == 0) |
| dk_process_one_division(dk); |
| } |
| } |