blob: c2f481bcd7dc49919dfe313d0c2cff23e6936f85 [file] [log] [blame]
/*
* Copyright (C) 2020 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
*
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
#define ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
#include "intrinsic_utils.h"
#include <array>
#include <cmath>
#include <functional>
#include <utility>
#include <vector>
#include <assert.h>
// We conditionally include neon optimizations for ARM devices
#pragma push_macro("USE_NEON")
#undef USE_NEON
#if defined(__ARM_NEON__) || defined(__aarch64__)
#include <arm_neon.h>
#define USE_NEON
#endif
namespace android::audio_utils {
static constexpr size_t kBiquadNumCoefs = 5;
static constexpr size_t kBiquadNumDelays = 2;
namespace details {
// Helper methods for constructing a constexpr array of function pointers.
// As function pointers are efficient and have no constructor/destructor
// this is preferred over std::function.
//
// SC stands for SAME_COEF_PER_CHANNEL, a compile time boolean constant.
template <template <size_t, bool, typename ...> typename F, bool SC, size_t... Is>
static inline constexpr auto make_functional_array_from_index_sequence(std::index_sequence<Is...>) {
using first_t = decltype(&F<0, false>::func); // type from function
using result_t = std::array<first_t, sizeof...(Is)>; // type of array
return result_t{{F<Is, SC>::func...}}; // initialize with functions.
}
template <template <size_t, bool, typename ...> typename F, size_t M, bool SC>
static inline constexpr auto make_functional_array() {
return make_functional_array_from_index_sequence<F, SC>(std::make_index_sequence<M>());
}
// Returns true if the poles are stable for a Biquad.
template <typename D>
static inline constexpr bool isStable(const D& a1, const D& a2) {
return fabs(a2) < D(1) && fabs(a1) < D(1) + a2;
}
// Simplifies Biquad coefficients.
// TODO: consider matched pole/zero cancellation.
// consider subnormal elimination for Intel processors.
template <typename D, typename T>
std::array<D, kBiquadNumCoefs> reduceCoefficients(const T& coef) {
std::array<D, kBiquadNumCoefs> lcoef;
if (coef.size() == kBiquadNumCoefs + 1) {
// General form of Biquad.
// Remove matched z^-1 factors in top and bottom (e.g. coefs[0] == coefs[3] == 0).
size_t offset = 0;
for (; offset < 2 && coef[offset] == 0 && coef[offset + 3] == 0; ++offset);
assert(coefs[offset + 3] != 0); // hmm... shouldn't we be causal?
// Normalize 6 coefficients to 5 for storage.
lcoef[0] = coef[offset] / coef[offset + 3];
for (size_t i = 1; i + offset < 3; ++i) {
lcoef[i] = coef[i + offset] / coef[offset + 3];
lcoef[i + 2] = coef[i + offset + 3] / coef[offset + 3];
}
} else if (coef.size() == kBiquadNumCoefs) {
std::copy(coef.begin(), coef.end(), lcoef.begin());
} else {
assert(coef.size() == kBiquadNumCoefs + 1 || coef.size() == kBiquadNumCoefs);
}
return lcoef;
}
// Sets a container of coefficients to storage.
template <typename D, typename T, typename DEST>
static inline void setCoefficients(
DEST& dest, size_t offset, size_t stride, size_t channelCount, const T& coef) {
auto lcoef = reduceCoefficients<D, T>(coef);
// replicate as needed
for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
for (size_t j = 0; j < channelCount; ++j) {
dest[i * stride + offset + j] = lcoef[i];
}
}
}
// For biquad_filter_fast, we template based on whether coef[i] is nonzero - this should be
// determined in a constexpr fashion for optimization.
// Helper which takes a stride to allow column processing of interleaved audio streams.
template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
void biquad_filter_1fast(D *out, const D *in, size_t frames, size_t stride,
size_t channelCount, D *delays, const D *coefs, size_t localStride) {
#if defined(__i386__) || defined(__x86_x64__)
D delta = std::numeric_limits<float>::min() * (1 << 24);
#endif
D b0, b1, b2, negativeA1, negativeA2;
if constexpr (SAME_COEF_PER_CHANNEL) {
b0 = coefs[0];
b1 = coefs[1];
b2 = coefs[2];
negativeA1 = -coefs[3];
negativeA2 = -coefs[4];
}
for (size_t i = 0; i < channelCount; ++i) {
if constexpr (!SAME_COEF_PER_CHANNEL) {
b0 = coefs[0];
b1 = coefs[localStride];
b2 = coefs[2 * localStride];
negativeA1 = -coefs[3 * localStride];
negativeA2 = -coefs[4 * localStride];
++coefs;
}
D s1n1 = delays[0];
D s2n1 = delays[localStride];
const D *input = &in[i];
D *output = &out[i];
for (size_t j = frames; j > 0; --j) {
// Adding a delta to avoid subnormal exception handling on the x86/x64 platform;
// this is not a problem with the ARM platform. The delta will not affect the
// precision of the result.
#if defined(__i386__) || defined(__x86_x64__)
const D xn = *input + delta;
#else
const D xn = *input;
#endif
D yn = (OCCUPANCY >> 0 & 1) * b0 * xn + s1n1;
s1n1 = (OCCUPANCY >> 1 & 1) * b1 * xn + (OCCUPANCY >> 3 & 1) * negativeA1 * yn + s2n1;
s2n1 = (OCCUPANCY >> 2 & 1) * b2 * xn + (OCCUPANCY >> 4 & 1) * negativeA2 * yn;
input += stride;
*output = yn;
output += stride;
#if defined(__i386__) || defined(__x86_x64__)
delta = -delta;
#endif
}
delays[0] = s1n1;
delays[localStride] = s2n1;
++delays;
}
}
// Helper function to zero channels in the input buffer.
// This is used for the degenerate coefficient case which results in all zeroes.
template <typename D>
void zeroChannels(D *out, size_t frames, size_t stride, size_t channelCount) {
if (stride == channelCount) {
memset(out, 0, sizeof(float) * frames * channelCount);
} else {
for (size_t i = 0; i < frames; i++) {
memset(out, 0, sizeof(float) * channelCount);
out += stride;
}
}
}
template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
void biquad_filter_fast(D *out, const D *in, size_t frames, size_t stride,
size_t channelCount, D *delays, const D *coefs, size_t localStride) {
if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
zeroChannels(out, frames, stride, channelCount);
return;
}
biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>(
out, in, frames, stride, channelCount, delays, coefs, localStride);
}
#ifdef USE_NEON
template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
void biquad_filter_neon_impl(F *out, const F *in, size_t frames, size_t stride,
size_t channelCount, F *delays, const F *coefs, size_t localStride) {
using namespace android::audio_utils::intrinsics;
constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T.
T b0, b1, b2, negativeA1, negativeA2;
if constexpr (SAME_COEF_PER_CHANNEL) {
b0 = vdupn<T>(coefs[0]);
b1 = vdupn<T>(coefs[1]);
b2 = vdupn<T>(coefs[2]);
negativeA1 = vneg(vdupn<T>(coefs[3]));
negativeA2 = vneg(vdupn<T>(coefs[4]));
}
for (size_t i = 0; i < channelCount; i += elements) {
if constexpr (!SAME_COEF_PER_CHANNEL) {
b0 = vld1<T>(coefs);
b1 = vld1<T>(coefs + localStride);
b2 = vld1<T>(coefs + localStride * 2);
negativeA1 = vneg(vld1<T>(coefs + localStride * 3));
negativeA2 = vneg(vld1<T>(coefs + localStride * 4));
coefs += elements;
}
T s1 = vld1<T>(&delays[0]);
T s2 = vld1<T>(&delays[localStride]);
const F *input = &in[i];
F *output = &out[i];
for (size_t j = frames; j > 0; --j) {
T xn = vld1<T>(input);
T yn = s1;
if constexpr (OCCUPANCY >> 0 & 1) {
yn = vmla(yn, b0, xn);
}
s1 = s2;
if constexpr (OCCUPANCY >> 3 & 1) {
s1 = vmla(s1, negativeA1, yn);
}
if constexpr (OCCUPANCY >> 1 & 1) {
s1 = vmla(s1, b1, xn);
}
if constexpr (OCCUPANCY >> 2 & 1) {
s2 = vmul(b2, xn);
} else {
s2 = vdupn<T>(0.f);
}
if constexpr (OCCUPANCY >> 4 & 1) {
s2 = vmla(s2, negativeA2, yn);
}
input += stride;
vst1(output, yn);
output += stride;
}
vst1(&delays[0], s1);
vst1(&delays[localStride], s2);
delays += elements;
}
}
#define BIQUAD_FILTER_CASE(N, ... /* type */) \
case N: { \
biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, __VA_ARGS__>( \
out + offset, in + offset, frames, stride, remaining, \
delays + offset, c, localStride); \
goto exit; \
}
template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
void biquad_filter_neon(D *out, const D *in, size_t frames, size_t stride,
size_t channelCount, D *delays, const D *coefs, size_t localStride) {
if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
zeroChannels(out, frames, stride, channelCount);
return;
}
// Possible alternative intrinsic types for 2, 9, 15 float elements.
// using alt_2_t = struct {struct { float a; float b; } s; };
// using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; };
// using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
for (size_t offset = 0; offset < channelCount; ) {
size_t remaining = channelCount - offset;
auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset;
if constexpr (std::is_same_v<D, float>) {
switch (remaining) {
default:
if (remaining >= 16) {
remaining &= ~15;
biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, float32x4x4_t>(
out + offset, in + offset, frames, stride, remaining,
delays + offset, c, localStride);
offset += remaining;
continue;
}
break; // case 1 handled at bottom.
BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>)
BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>)
BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>)
BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>)
BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>)
BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>)
BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>)
// We choose the NEON intrinsic type over internal_array for 8 to
// check if there is any performance difference in benchmark (should be similar).
// BIQUAD_FILTER_CASE(8, intrinsics::internal_array_t<float, 8>)
BIQUAD_FILTER_CASE(8, float32x4x2_t)
BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>)
BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>)
BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>)
BIQUAD_FILTER_CASE(4, float32x4_t)
// We choose the NEON intrinsic type over internal_array for 4 to
// check if there is any performance difference in benchmark (should be similar).
// BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<float, 4>)
BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>)
BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>)
}
} else if constexpr (std::is_same_v<D, double>) {
#if defined(__aarch64__)
switch (remaining) {
default:
if (remaining >= 8) {
remaining &= ~7;
biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL,
intrinsics::internal_array_t<double, 8>>(
out + offset, in + offset, frames, stride, remaining,
delays + offset, c, localStride);
offset += remaining;
continue;
}
break; // case 1 handled at bottom.
BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>)
BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>)
BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>)
BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>)
BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>)
BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>)
};
#endif
}
// Essentially the code below is scalar, the same as
// biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>,
// but formulated with NEON intrinsic-like call pattern.
biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, D>(
out + offset, in + offset, frames, stride, remaining,
delays + offset, c, localStride);
offset += remaining;
}
exit:;
}
#endif // USE_NEON
} // namespace details
/**
* BiquadFilter
*
* A multichannel Biquad filter implementation of the following transfer function.
*
* \f[
* H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
* { 1 + a_1 z^{-1} + a_2 z^{-2} }
* \f]
*
* <!--
* b_0 + b_1 z^{-1} + b_2 z^{-2}
* H(z)= -----------------------------
* 1 + a_1 z^{-1} + a_2 z^{-2}
* -->
*
* Details:
* 1. The transposed direct type 2 implementation allows zeros to be computed
* before poles in the internal state for improved filter precision and
* better time-varying coefficient performance.
* 2. We optimize for zero coefficients using a compile-time generated function table.
* 3. We optimize for vector operations using column vector operations with stride
* into interleaved audio data.
* 4. The denominator coefficients a_1 and a_2 are stored in positive form, despite the
* negated form being slightly simpler for optimization (addition is commutative but
* subtraction is not commutative). This is to permit obtaining the coefficients
* as a const reference.
*
* Compatibility issue: Some Biquad libraries store the denominator coefficients
* in negated form. We explicitly negate before entering into the inner loop.
* 5. The full 6 coefficient Biquad filter form with a_0 != 1 may be used for setting
* coefficients. See setCoefficients() below.
*
* If SAME_COEFFICIENTS_PER_CHANNEL is false, then mCoefs is stored interleaved by channel.
*
* The Biquad filter update equation in transposed Direct form 2 is as follows:
*
* \f{eqnarray*}{
* y[n] &=& b0 * x[n] + s1[n - 1] \\
* s1[n] &=& s2[n - 1] + b1 * x[n] - a1 * y[n] \\
* s2[n] &=& b2 * x[n] - a2 * y[n]
* \f}
*
* For the transposed Direct form 2 update equation s1 and s2 represent the delay state
* contained in the internal vector mDelays[]. This is stored interleaved by channel.
*
* Use -ffast-math` to permit associative math optimizations to get non-zero optimization as
* we do not rely on strict C operator precedence and associativity here.
* TODO(b/159373530): Use compound statement scoped pragmas instead of `-ffast-math`.
*
* \param D type variable representing the data type, one of float or double.
* The default is float.
* \param SAME_COEF_PER_CHANNEL bool which is true if all the Biquad coefficients
* are shared between channels, or false if the Biquad coefficients
* may differ between channels. The default is true.
*/
template <typename D = float, bool SAME_COEF_PER_CHANNEL = true>
class BiquadFilter {
public:
template <typename T = std::array<D, kBiquadNumCoefs>>
explicit BiquadFilter(size_t channelCount,
const T& coefs = {}, bool optimized = true)
: mChannelCount(channelCount)
, mCoefs(kBiquadNumCoefs * (SAME_COEF_PER_CHANNEL ? 1 : mChannelCount))
, mDelays(channelCount * kBiquadNumDelays) {
setCoefficients(coefs, optimized);
}
// copy constructors
BiquadFilter(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
*this = other;
}
BiquadFilter(BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
*this = std::move(other);
}
// copy assignment
BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
mChannelCount = other.mChannelCount;
mCoefs = other.mCoefs;
mDelays = other.mDelays;
return *this;
}
BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
mChannelCount = other.mChannelCount;
mCoefs = std::move(other.mCoefs);
mDelays = std::move(other.mDelays);
return *this;
}
// operator overloads for equality tests
bool operator==(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
return mChannelCount == other.mChannelCount
&& mCoefs == other.mCoefs
&& mDelays == other.mDelays;
}
bool operator!=(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
return !operator==(other);
}
/**
* \brief Sets filter coefficients
*
* \param coefs pointer to the filter coefficients array.
* \param optimized whether to use processor optimized function (optional, defaults true).
* \return true if the BiquadFilter is stable, otherwise, return false.
*
* The input coefficients are interpreted in the following manner:
*
* If size of container is 5 (normalized Biquad):
* coefs[0] is b0,
* coefs[1] is b1,
* coefs[2] is b2,
* coefs[3] is a1,
* coefs[4] is a2.
*
* \f[
* H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
* { 1 + a_1 z^{-1} + a_2 z^{-2} }
* \f]
* <!--
* b_0 + b_1 z^{-1} + b_2 z^{-2}
* H(z)= -----------------------------
* 1 + a_1 z^{-1} + a_2 z^{-2}
* -->
*
* If size of container is 6 (general Biquad):
* coefs[0] is b0,
* coefs[1] is b1,
* coefs[2] is b2,
* coefs[3] is a0,
* coefs[4] is a1,
* coefs[5] is a2.
*
* \f[
* H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
* { a_0 + a_1 z^{-1} + a_2 z^{-2} }
* \f]
* <!--
* b_0 + b_1 z^{-1} + b_2 z^{-2}
* H(z)= -----------------------------
* a_0 + a_1 z^{-1} + a_2 z^{-2}
* -->
*
* The internal representation is a normalized Biquad.
*/
template <typename T = std::array<D, kBiquadNumCoefs>>
bool setCoefficients(const T& coefs, bool optimized = true) {
if constexpr (SAME_COEF_PER_CHANNEL) {
details::setCoefficients<D, T>(
mCoefs, 0 /* offset */, 1 /* stride */, 1 /* channelCount */, coefs);
} else {
if (coefs.size() == mCoefs.size()) {
std::copy(coefs.begin(), coefs.end(), mCoefs.begin());
} else {
details::setCoefficients<D, T>(
mCoefs, 0 /* offset */, mChannelCount, mChannelCount, coefs);
}
}
setOptimization(optimized);
return isStable();
}
/**
* Sets coefficients for one of the filter channels, specified by channelIndex.
*
* This method is only available if SAME_COEF_PER_CHANNEL is false.
*
* \param coefs the coefficients to set.
* \param channelIndex the particular channel index to set.
* \param optimized whether to use optimized function (optional, defaults true).
*/
template <typename T = std::array<D, kBiquadNumCoefs>>
bool setCoefficients(const T& coefs, size_t channelIndex, bool optimized = true) {
static_assert(!SAME_COEF_PER_CHANNEL);
details::setCoefficients<D, T>(
mCoefs, channelIndex, mChannelCount, 1 /* channelCount */, coefs);
setOptimization(optimized);
return isStable();
}
/**
* Returns the coefficients as a const vector reference.
*
* If multichannel and the template variable SAME_COEF_PER_CHANNEL is true,
* the coefficients are interleaved by channel.
*/
const std::vector<D>& getCoefficients() const {
return mCoefs;
}
/**
* Returns true if the filter is stable.
*
* \param channelIndex ignored if SAME_COEF_PER_CHANNEL is true,
* asserts if channelIndex >= channel count (zero based index).
*/
bool isStable(size_t channelIndex = 0) const {
if constexpr (SAME_COEF_PER_CHANNEL) {
return details::isStable(mCoefs[3], mCoefs[4]);
} else {
assert(channelIndex < mChannelCount);
return details::isStable(
mCoefs[3 * mChannelCount + channelIndex],
mCoefs[4 * mChannelCount + channelIndex]);
}
}
/**
* Updates the filter function based on processor optimization.
*
* \param optimized if true, enables Processor based optimization.
*/
void setOptimization(bool optimized) {
// Determine which coefficients are nonzero as a bit field.
size_t category = 0;
for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
if constexpr (SAME_COEF_PER_CHANNEL) {
category |= (mCoefs[i] != 0) << i;
} else {
for (size_t j = 0; j < mChannelCount; ++j) {
if (mCoefs[i * mChannelCount + j] != 0) {
category |= 1 << i;
break;
}
}
}
}
// Select the proper filtering function from our array.
(void)optimized; // avoid unused variable warning.
mFunc = mFilterFast[category]; // default if we don't have processor optimization.
#ifdef USE_NEON
/* if constexpr (std::is_same_v<D, float>) */ {
if (optimized) {
mFunc = mFilterNeon[category];
}
}
#endif
}
/**
* \brief Filters the input data
*
* \param out pointer to the output data
* \param in pointer to the input data
* \param frames number of audio frames to be processed
*/
void process(D* out, const D *in, size_t frames) {
process(out, in, frames, mChannelCount);
}
/**
* \brief Filters the input data with stride
*
* \param out pointer to the output data
* \param in pointer to the input data
* \param frames number of audio frames to be processed
* \param stride the total number of samples associated with a frame, if not channelCount.
*/
void process(D* out, const D *in, size_t frames, size_t stride) {
assert(stride >= mChannelCount);
mFunc(out, in, frames, stride, mChannelCount, mDelays.data(),
mCoefs.data(), mChannelCount);
}
/**
* EXPERIMENTAL:
* Processes 1D input data, with mChannel Biquads, using sliding window parallelism.
*
* Instead of considering mChannel Biquads as one-per-input channel, this method treats
* the mChannel biquads as applied in sequence to a single 1D input stream,
* with the last channel count Biquad being applied first.
*
* input audio data -> BQ_{n-1} -> BQ{n-2} -> BQ_{n-3} -> BQ_{0} -> output
*
* TODO: Make this code efficient for NEON and split the destination from the source.
*
* Theoretically this code should be much faster for 1D input if one has 4+ Biquads to be
* sequentially applied, but in practice it is *MUCH* slower.
* On NEON, the data cannot be written then read in-place without incurring
* memory stall penalties. A shifting NEON holding register is required to make this
* a practical improvement.
*/
void process1D(D* inout, size_t frames) {
size_t remaining = mChannelCount;
#ifdef USE_NEON
// We apply NEON acceleration striped with 4 filters (channels) at once.
// Filters operations commute, nevertheless we apply the filters in order.
if (frames >= 2 * mChannelCount) {
constexpr size_t channelBlock = 4;
for (; remaining >= channelBlock; remaining -= channelBlock) {
const size_t baseIdx = remaining - channelBlock;
// This is the 1D accelerated method.
// prime the data pipe.
for (size_t i = 0; i < channelBlock - 1; ++i) {
size_t fromEnd = remaining - i - 1;
auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
auto delays = mDelays.data() + fromEnd;
mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1,
delays, coefs, mChannelCount);
}
auto delays = mDelays.data() + baseIdx;
auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : baseIdx);
// Parallel processing - we use a sliding window doing channelBlock at once,
// sliding one audio sample at a time.
mFunc(inout, inout,
frames - channelBlock + 1, 1 /* stride */, channelBlock,
delays, coefs, mChannelCount);
// drain data pipe.
for (size_t i = 1; i < channelBlock; ++i) {
mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i,
1 /* frames */, 1 /* stride */, channelBlock - i,
delays, coefs, mChannelCount);
}
}
}
#endif
// For short data sequences, we use the serial single channel logical equivalent
for (; remaining > 0; --remaining) {
size_t fromEnd = remaining - 1;
auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
mFunc(inout, inout,
frames, 1 /* stride */, 1 /* channelCount */,
mDelays.data() + fromEnd, coefs, mChannelCount);
}
}
/**
* \brief Clears the delay elements
*
* This function clears the delay elements representing the filter state.
*/
void clear() {
std::fill(mDelays.begin(), mDelays.end(), 0.f);
}
/**
* \brief Sets the internal delays from a vector
*
* For a multichannel stream, the delays are interleaved by channel:
* delays[2 * i + 0] is s1 of i-th channel,
* delays[2 * i + 1] is s2 of i-th channel,
* where index i runs from 0 to (mChannelCount - 1).
*
* \param delays reference to vector containing delays.
*/
void setDelays(std::vector<D>& delays) {
assert(delays.size() == mDelays.size());
mDelays = std::move(delays);
}
/**
* \brief Gets delay elements as a vector
*
* For a multichannel stream, the delays are interleaved by channel:
* delays[2 * i + 0] is s1 of i-th channel,
* delays[2 * i + 1] is s2 of i-th channel,
* where index i runs from 0 to (mChannelCount - 1).
*
* \return a const vector reference of delays.
*/
const std::vector<D>& getDelays() const {
return mDelays;
}
private:
/* const */ size_t mChannelCount; // not const because we can assign to it on operator equals.
/*
* \var D mCoefs
* \brief Stores the filter coefficients
*
* If SAME_COEF_PER_CHANNEL is false, the filter coefficients are stored
* interleaved by channel.
*/
std::vector<D> mCoefs;
/**
* \var D mDelays
* \brief The delay state.
*
* The delays are stored channel interleaved in the following manner,
* mDelays[2 * i + 0] is s1 of i-th channel
* mDelays[2 * i + 1] is s2 of i-th channel
* index i runs from 0 to (mChannelCount - 1).
*/
std::vector<D> mDelays;
using filter_func = decltype(details::biquad_filter_fast<0, true, D>);
/**
* \var filter_func* mFunc
*
* The current filter function selected for the channel occupancy of the Biquad.
*/
filter_func *mFunc;
// Create a functional wrapper to feed "biquad_filter_fast" to
// make_functional_array() to populate the array.
//
// OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
// b0 b1 b2 a1 a2 (from lsb to msb)
template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
struct FuncWrap {
template<typename T>
static constexpr size_t nearest() {
// Combine cases to both improve expected performance and reduce code space.
// Some occupancy masks provide worse performance than more occupied masks.
constexpr size_t required_occupancies[] = {
1, // constant scale
3, // single zero
7, // double zero
9, // single pole
// 11, // first order IIR (unnecessary optimization, close enough to 31).
27, // double pole + single zero
31, // second order IIR (full Biquad)
};
if constexpr (OCCUPANCY < 32) {
for (auto test : required_occupancies) {
if ((OCCUPANCY & test) == OCCUPANCY) return test;
}
} else {
static_assert(intrinsics::dependent_false_v<T>);
}
return 0; // never gets here.
}
static void func(D* out, const D *in, size_t frames, size_t stride,
size_t channelCount, D *delays, const D *coef, size_t localStride) {
constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
details::biquad_filter_fast<NEAREST_OCCUPANCY, SC>(
out, in, frames, stride, channelCount, delays, coef, localStride);
}
};
/**
* \var mFilterFast
*
* std::array of functions based on coefficient occupancy.
*
* static inline constexpr std::array<filter_func*, M> mArray = {
* biquad_filter_fast<0>,
* biquad_filter_fast<1>,
* biquad_filter_fast<2>,
* ...
* biquad_filter_fast<(1 << kBiquadNumCoefs) - 1>,
* };
*
* Every time the coefficients are changed, we select the processing function from
* this table.
*/
static inline constexpr auto mFilterFast =
details::make_functional_array<
FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
#ifdef USE_NEON
// OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
// b0 b1 b2 a1 a2 (from lsb to msb)
template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
struct FuncWrapNeon {
template<typename T>
static constexpr size_t nearest() {
// combine cases to both improve expected performance and reduce code space.
//
// This lists the occupancies we will specialize functions for.
constexpr size_t required_occupancies[] = {
1, // constant scale
3, // single zero
7, // double zero
9, // single pole
11, // first order IIR
27, // double pole + single zero
31, // second order IIR (full Biquad)
};
if constexpr (OCCUPANCY < 32) {
for (auto test : required_occupancies) {
if ((OCCUPANCY & test) == OCCUPANCY) return test;
}
} else {
static_assert(intrinsics::dependent_false_v<T>);
}
return 0; // never gets here.
}
static void func(D* out, const D *in, size_t frames, size_t stride,
size_t channelCount, D *delays, const D *coef, size_t localStride) {
constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
details::biquad_filter_neon<NEAREST_OCCUPANCY, SC>(
out, in, frames, stride, channelCount, delays, coef, localStride);
}
};
// Neon optimized array of functions.
static inline constexpr auto mFilterNeon =
details::make_functional_array<
FuncWrapNeon, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
#endif // USE_NEON
};
} // namespace android::audio_utils
#pragma pop_macro("USE_NEON")
#endif // !ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H