audio_utils: Biquad refactorization am: d96c30322a

Original change: https://android-review.googlesource.com/c/platform/system/media/+/1870313

Change-Id: I8c30d8501fdee42fe6e766ed8077f9b691541829
diff --git a/audio_utils/benchmarks/biquad_filter_benchmark.cpp b/audio_utils/benchmarks/biquad_filter_benchmark.cpp
index 35aa839..b0f6f6a 100644
--- a/audio_utils/benchmarks/biquad_filter_benchmark.cpp
+++ b/audio_utils/benchmarks/biquad_filter_benchmark.cpp
@@ -85,320 +85,325 @@
 
 /*******************************************************************
  A test result running on Pixel 4XL for comparison.
- The first parameter indicates the input data is subnormal or not.
- 0 for normal input data, 1 for subnormal input data.
- The second parameter indicates the channel count.
- The third parameter indicates the occupancy of the coefficients.
+
+ Parameterized Test BM_BiquadFilter1D/A
+ <A> is 0 for normal input data, 1 for subnormal input data.
+
+ Parameterized Test BM_BiquadFilter<TYPE>/A/B/C
+ <A> is 0 for normal input data, 1 for subnormal input data.
+ <B> is the channel count, starting from 1.
+ <C> indicates the occupancy of the coefficients as a bitmask (1 - 31) representing
+     b0, b1, b2, a0, a1.  31 indicates all Biquad coefficients are non-zero.
 
 -----------------------------------------------------------------------------------
 Benchmark                                         Time             CPU   Iterations
 -----------------------------------------------------------------------------------
-BM_BiquadFilter1D/0                             556 ns          555 ns      1263112
-BM_BiquadFilter1D/1                             560 ns          558 ns      1253287
-BM_BiquadFilterFloatOptimized/0/1/31           2178 ns         2172 ns       322245
-BM_BiquadFilterFloatOptimized/0/2/31           5013 ns         4999 ns       140023
-BM_BiquadFilterFloatOptimized/0/3/31           4938 ns         4924 ns       142153
-BM_BiquadFilterFloatOptimized/0/4/31           4996 ns         4981 ns       140506
-BM_BiquadFilterFloatOptimized/0/5/31           4931 ns         4917 ns       142358
-BM_BiquadFilterFloatOptimized/0/6/31           5222 ns         5208 ns       134401
-BM_BiquadFilterFloatOptimized/0/7/31           4694 ns         4681 ns       149552
-BM_BiquadFilterFloatOptimized/0/8/31           5174 ns         5159 ns       135656
-BM_BiquadFilterFloatOptimized/0/9/31           5604 ns         5589 ns       125174
-BM_BiquadFilterFloatOptimized/0/10/31          6136 ns         6118 ns       114547
-BM_BiquadFilterFloatOptimized/0/11/31          6080 ns         6065 ns       115425
-BM_BiquadFilterFloatOptimized/0/12/31          6114 ns         6098 ns       114790
-BM_BiquadFilterFloatOptimized/0/13/31          7247 ns         7229 ns        96798
-BM_BiquadFilterFloatOptimized/0/14/31          7539 ns         7515 ns        93137
-BM_BiquadFilterFloatOptimized/0/15/31         12787 ns        12748 ns        55041
-BM_BiquadFilterFloatOptimized/0/16/31          7493 ns         7470 ns        93688
-BM_BiquadFilterFloatOptimized/0/17/31          9797 ns         9766 ns        71597
-BM_BiquadFilterFloatOptimized/0/18/31         12563 ns        12524 ns        55862
-BM_BiquadFilterFloatOptimized/0/19/31         12560 ns        12521 ns        55846
-BM_BiquadFilterFloatOptimized/0/20/31         12560 ns        12523 ns        55926
-BM_BiquadFilterFloatOptimized/0/21/31         12576 ns        12543 ns        55795
-BM_BiquadFilterFloatOptimized/0/22/31         12881 ns        12845 ns        54408
-BM_BiquadFilterFloatOptimized/0/23/31         12681 ns        12635 ns        55410
-BM_BiquadFilterFloatOptimized/0/24/31         12749 ns        12712 ns        55041
-BM_BiquadFilterFloatOptimized/0/1/1             557 ns          555 ns      1260939
-BM_BiquadFilterFloatOptimized/0/1/2             652 ns          650 ns      1077181
-BM_BiquadFilterFloatOptimized/0/1/3             652 ns          650 ns      1077352
-BM_BiquadFilterFloatOptimized/0/1/4             833 ns          831 ns       840290
-BM_BiquadFilterFloatOptimized/0/1/5             835 ns          833 ns       840171
-BM_BiquadFilterFloatOptimized/0/1/6             836 ns          833 ns       840106
-BM_BiquadFilterFloatOptimized/0/1/7             835 ns          832 ns       840200
-BM_BiquadFilterFloatOptimized/0/1/8            1813 ns         1808 ns       387100
-BM_BiquadFilterFloatOptimized/0/1/9            1813 ns         1808 ns       387152
-BM_BiquadFilterFloatOptimized/0/1/10           2552 ns         2544 ns       275176
-BM_BiquadFilterFloatOptimized/0/1/11           2551 ns         2544 ns       275192
-BM_BiquadFilterFloatOptimized/0/1/12           2178 ns         2172 ns       322335
-BM_BiquadFilterFloatOptimized/0/1/13           2179 ns         2172 ns       322286
-BM_BiquadFilterFloatOptimized/0/1/14           2178 ns         2172 ns       322252
-BM_BiquadFilterFloatOptimized/0/1/15           2178 ns         2172 ns       322285
-BM_BiquadFilterFloatOptimized/0/1/16           2175 ns         2169 ns       322716
-BM_BiquadFilterFloatOptimized/0/1/17           2174 ns         2169 ns       322730
-BM_BiquadFilterFloatOptimized/0/1/18           2175 ns         2169 ns       322719
-BM_BiquadFilterFloatOptimized/0/1/19           2175 ns         2169 ns       322741
-BM_BiquadFilterFloatOptimized/0/1/20           2178 ns         2172 ns       322336
-BM_BiquadFilterFloatOptimized/0/1/21           2178 ns         2172 ns       322315
-BM_BiquadFilterFloatOptimized/0/1/22           2178 ns         2172 ns       322328
-BM_BiquadFilterFloatOptimized/0/1/23           2178 ns         2172 ns       322306
-BM_BiquadFilterFloatOptimized/0/1/24           2175 ns         2169 ns       322752
-BM_BiquadFilterFloatOptimized/0/1/25           2174 ns         2169 ns       322721
-BM_BiquadFilterFloatOptimized/0/1/26           2174 ns         2169 ns       322722
-BM_BiquadFilterFloatOptimized/0/1/27           2175 ns         2169 ns       322704
-BM_BiquadFilterFloatOptimized/0/1/28           2178 ns         2172 ns       322317
-BM_BiquadFilterFloatOptimized/0/1/29           2178 ns         2172 ns       322308
-BM_BiquadFilterFloatOptimized/0/1/30           2179 ns         2172 ns       322300
-BM_BiquadFilterFloatOptimized/0/1/31           2178 ns         2172 ns       322271
-BM_BiquadFilterFloatOptimized/0/2/1             737 ns          734 ns       953033
-BM_BiquadFilterFloatOptimized/0/2/2            1085 ns         1082 ns       647110
-BM_BiquadFilterFloatOptimized/0/2/3            1085 ns         1082 ns       646630
-BM_BiquadFilterFloatOptimized/0/2/4            1538 ns         1534 ns       456015
-BM_BiquadFilterFloatOptimized/0/2/5            1536 ns         1532 ns       456137
-BM_BiquadFilterFloatOptimized/0/2/6            1537 ns         1532 ns       456168
-BM_BiquadFilterFloatOptimized/0/2/7            1536 ns         1532 ns       456982
-BM_BiquadFilterFloatOptimized/0/2/8            1974 ns         1969 ns       355506
-BM_BiquadFilterFloatOptimized/0/2/9            1974 ns         1969 ns       355489
-BM_BiquadFilterFloatOptimized/0/2/10           4345 ns         4333 ns       161562
-BM_BiquadFilterFloatOptimized/0/2/11           4344 ns         4332 ns       161564
-BM_BiquadFilterFloatOptimized/0/2/12           5014 ns         4999 ns       140035
-BM_BiquadFilterFloatOptimized/0/2/13           5014 ns         4999 ns       139958
-BM_BiquadFilterFloatOptimized/0/2/14           5012 ns         4999 ns       139996
-BM_BiquadFilterFloatOptimized/0/2/15           5013 ns         4999 ns       140021
-BM_BiquadFilterFloatOptimized/0/2/16           3985 ns         3973 ns       176193
-BM_BiquadFilterFloatOptimized/0/2/17           3984 ns         3973 ns       176178
-BM_BiquadFilterFloatOptimized/0/2/18           3984 ns         3973 ns       176178
-BM_BiquadFilterFloatOptimized/0/2/19           3984 ns         3973 ns       176179
-BM_BiquadFilterFloatOptimized/0/2/20           5013 ns         4999 ns       140011
-BM_BiquadFilterFloatOptimized/0/2/21           5013 ns         4999 ns       140042
-BM_BiquadFilterFloatOptimized/0/2/22           5012 ns         4999 ns       140027
-BM_BiquadFilterFloatOptimized/0/2/23           5011 ns         4999 ns       140028
-BM_BiquadFilterFloatOptimized/0/2/24           3984 ns         3973 ns       176189
-BM_BiquadFilterFloatOptimized/0/2/25           3984 ns         3973 ns       176199
-BM_BiquadFilterFloatOptimized/0/2/26           3979 ns         3971 ns       176263
-BM_BiquadFilterFloatOptimized/0/2/27           3984 ns         3973 ns       176206
-BM_BiquadFilterFloatOptimized/0/2/28           5013 ns         4999 ns       140019
-BM_BiquadFilterFloatOptimized/0/2/29           5013 ns         4999 ns       140032
-BM_BiquadFilterFloatOptimized/0/2/30           5013 ns         4999 ns       140031
-BM_BiquadFilterFloatOptimized/0/2/31           5012 ns         4999 ns       140021
-BM_BiquadFilterFloatOptimized/0/3/1            1010 ns         1007 ns       695238
-BM_BiquadFilterFloatOptimized/0/3/2            1760 ns         1755 ns       409554
-BM_BiquadFilterFloatOptimized/0/3/3            1750 ns         1745 ns       391924
-BM_BiquadFilterFloatOptimized/0/3/4            2315 ns         2308 ns       303349
-BM_BiquadFilterFloatOptimized/0/3/5            2315 ns         2309 ns       303177
-BM_BiquadFilterFloatOptimized/0/3/6            2316 ns         2309 ns       303026
-BM_BiquadFilterFloatOptimized/0/3/7            2315 ns         2309 ns       303133
-BM_BiquadFilterFloatOptimized/0/3/8            3052 ns         3044 ns       229836
-BM_BiquadFilterFloatOptimized/0/3/9            3052 ns         3044 ns       229888
-BM_BiquadFilterFloatOptimized/0/3/10           4345 ns         4333 ns       161546
-BM_BiquadFilterFloatOptimized/0/3/11           4344 ns         4333 ns       161549
-BM_BiquadFilterFloatOptimized/0/3/12           4937 ns         4924 ns       142178
-BM_BiquadFilterFloatOptimized/0/3/13           4933 ns         4923 ns       142166
-BM_BiquadFilterFloatOptimized/0/3/14           4937 ns         4924 ns       142174
-BM_BiquadFilterFloatOptimized/0/3/15           4937 ns         4924 ns       142139
-BM_BiquadFilterFloatOptimized/0/3/16           4068 ns         4058 ns       172507
-BM_BiquadFilterFloatOptimized/0/3/17           4068 ns         4057 ns       172495
-BM_BiquadFilterFloatOptimized/0/3/18           4069 ns         4058 ns       172509
-BM_BiquadFilterFloatOptimized/0/3/19           4070 ns         4059 ns       172495
-BM_BiquadFilterFloatOptimized/0/3/20           4937 ns         4924 ns       142161
-BM_BiquadFilterFloatOptimized/0/3/21           4937 ns         4924 ns       142171
-BM_BiquadFilterFloatOptimized/0/3/22           4937 ns         4923 ns       142172
-BM_BiquadFilterFloatOptimized/0/3/23           4938 ns         4924 ns       142191
-BM_BiquadFilterFloatOptimized/0/3/24           4072 ns         4058 ns       172484
-BM_BiquadFilterFloatOptimized/0/3/25           4070 ns         4058 ns       172532
-BM_BiquadFilterFloatOptimized/0/3/26           4068 ns         4058 ns       172543
-BM_BiquadFilterFloatOptimized/0/3/27           4069 ns         4058 ns       172503
-BM_BiquadFilterFloatOptimized/0/3/28           4937 ns         4924 ns       142173
-BM_BiquadFilterFloatOptimized/0/3/29           4940 ns         4924 ns       142160
-BM_BiquadFilterFloatOptimized/0/3/30           4937 ns         4924 ns       142168
-BM_BiquadFilterFloatOptimized/0/3/31           4937 ns         4924 ns       142171
-BM_BiquadFilterFloatOptimized/0/4/1             555 ns          553 ns      1264721
-BM_BiquadFilterFloatOptimized/0/4/2             736 ns          734 ns       953947
-BM_BiquadFilterFloatOptimized/0/4/3             736 ns          734 ns       953825
-BM_BiquadFilterFloatOptimized/0/4/4            1357 ns         1353 ns       517334
-BM_BiquadFilterFloatOptimized/0/4/5            1357 ns         1353 ns       517339
-BM_BiquadFilterFloatOptimized/0/4/6            1357 ns         1353 ns       517307
-BM_BiquadFilterFloatOptimized/0/4/7            1357 ns         1353 ns       517153
-BM_BiquadFilterFloatOptimized/0/4/8            1901 ns         1896 ns       369069
-BM_BiquadFilterFloatOptimized/0/4/9            1902 ns         1897 ns       369100
-BM_BiquadFilterFloatOptimized/0/4/10           3984 ns         3972 ns       176207
-BM_BiquadFilterFloatOptimized/0/4/11           3984 ns         3972 ns       176209
-BM_BiquadFilterFloatOptimized/0/4/12           4998 ns         4982 ns       140517
-BM_BiquadFilterFloatOptimized/0/4/13           4996 ns         4982 ns       140523
-BM_BiquadFilterFloatOptimized/0/4/14           4996 ns         4982 ns       140527
-BM_BiquadFilterFloatOptimized/0/4/15           4995 ns         4982 ns       140510
-BM_BiquadFilterFloatOptimized/0/4/16           3984 ns         3973 ns       176180
-BM_BiquadFilterFloatOptimized/0/4/17           3985 ns         3973 ns       176195
-BM_BiquadFilterFloatOptimized/0/4/18           3985 ns         3973 ns       176206
-BM_BiquadFilterFloatOptimized/0/4/19           3984 ns         3973 ns       176193
-BM_BiquadFilterFloatOptimized/0/4/20           4999 ns         4984 ns       140465
-BM_BiquadFilterFloatOptimized/0/4/21           4997 ns         4982 ns       140518
-BM_BiquadFilterFloatOptimized/0/4/22           4997 ns         4982 ns       140541
-BM_BiquadFilterFloatOptimized/0/4/23           4995 ns         4982 ns       140518
-BM_BiquadFilterFloatOptimized/0/4/24           3984 ns         3973 ns       176197
-BM_BiquadFilterFloatOptimized/0/4/25           3983 ns         3973 ns       176182
-BM_BiquadFilterFloatOptimized/0/4/26           3984 ns         3973 ns       176193
-BM_BiquadFilterFloatOptimized/0/4/27           3985 ns         3973 ns       176205
-BM_BiquadFilterFloatOptimized/0/4/28           4997 ns         4982 ns       140507
-BM_BiquadFilterFloatOptimized/0/4/29           4996 ns         4982 ns       140515
-BM_BiquadFilterFloatOptimized/0/4/30           4996 ns         4983 ns       140517
-BM_BiquadFilterFloatOptimized/0/4/31           4998 ns         4982 ns       140519
-BM_BiquadFilterFloatOptimized/1/1/1             557 ns          555 ns      1261214
-BM_BiquadFilterFloatOptimized/1/1/2             652 ns          650 ns      1077578
-BM_BiquadFilterFloatOptimized/1/1/3             652 ns          650 ns      1077688
-BM_BiquadFilterFloatOptimized/1/1/4             834 ns          832 ns       841263
-BM_BiquadFilterFloatOptimized/1/1/5             836 ns          833 ns       840264
-BM_BiquadFilterFloatOptimized/1/1/6             836 ns          833 ns       840002
-BM_BiquadFilterFloatOptimized/1/1/7             835 ns          833 ns       840209
-BM_BiquadFilterFloatOptimized/1/1/8            1813 ns         1808 ns       387140
-BM_BiquadFilterFloatOptimized/1/1/9            1814 ns         1808 ns       387077
-BM_BiquadFilterFloatOptimized/1/1/10           2552 ns         2544 ns       275164
-BM_BiquadFilterFloatOptimized/1/1/11           2552 ns         2545 ns       275177
-BM_BiquadFilterFloatOptimized/1/1/12           2178 ns         2172 ns       322211
-BM_BiquadFilterFloatOptimized/1/1/13           2178 ns         2172 ns       322244
-BM_BiquadFilterFloatOptimized/1/1/14           2179 ns         2172 ns       322290
-BM_BiquadFilterFloatOptimized/1/1/15           2179 ns         2172 ns       322318
-BM_BiquadFilterFloatOptimized/1/1/16           2175 ns         2169 ns       322771
-BM_BiquadFilterFloatOptimized/1/1/17           2176 ns         2169 ns       322723
-BM_BiquadFilterFloatOptimized/1/1/18           2175 ns         2169 ns       322752
-BM_BiquadFilterFloatOptimized/1/1/19           2175 ns         2169 ns       322712
-BM_BiquadFilterFloatOptimized/1/1/20           2178 ns         2172 ns       322229
-BM_BiquadFilterFloatOptimized/1/1/21           2178 ns         2172 ns       322263
-BM_BiquadFilterFloatOptimized/1/1/22           2178 ns         2172 ns       322271
-BM_BiquadFilterFloatOptimized/1/1/23           2178 ns         2172 ns       322302
-BM_BiquadFilterFloatOptimized/1/1/24           2176 ns         2169 ns       322749
-BM_BiquadFilterFloatOptimized/1/1/25           2175 ns         2169 ns       322653
-BM_BiquadFilterFloatOptimized/1/1/26           2175 ns         2169 ns       322739
-BM_BiquadFilterFloatOptimized/1/1/27           2175 ns         2169 ns       322709
-BM_BiquadFilterFloatOptimized/1/1/28           2178 ns         2172 ns       322242
-BM_BiquadFilterFloatOptimized/1/1/29           2178 ns         2172 ns       322286
-BM_BiquadFilterFloatOptimized/1/1/30           2177 ns         2172 ns       322259
-BM_BiquadFilterFloatOptimized/1/1/31           2178 ns         2172 ns       322321
-BM_BiquadFilterFloatOptimized/1/2/1             737 ns          734 ns       953000
-BM_BiquadFilterFloatOptimized/1/2/2            1085 ns         1082 ns       646529
-BM_BiquadFilterFloatOptimized/1/2/3            1086 ns         1082 ns       646983
-BM_BiquadFilterFloatOptimized/1/2/4            1537 ns         1533 ns       456082
-BM_BiquadFilterFloatOptimized/1/2/5            1538 ns         1533 ns       457062
-BM_BiquadFilterFloatOptimized/1/2/6            1539 ns         1534 ns       457137
-BM_BiquadFilterFloatOptimized/1/2/7            1539 ns         1534 ns       457042
-BM_BiquadFilterFloatOptimized/1/2/8            1975 ns         1969 ns       355538
-BM_BiquadFilterFloatOptimized/1/2/9            1975 ns         1969 ns       355560
-BM_BiquadFilterFloatOptimized/1/2/10           4347 ns         4333 ns       161568
-BM_BiquadFilterFloatOptimized/1/2/11           4345 ns         4333 ns       161551
-BM_BiquadFilterFloatOptimized/1/2/12           5014 ns         4999 ns       139998
-BM_BiquadFilterFloatOptimized/1/2/13           5014 ns         4999 ns       140001
-BM_BiquadFilterFloatOptimized/1/2/14           5016 ns         5000 ns       140022
-BM_BiquadFilterFloatOptimized/1/2/15           5013 ns         4999 ns       140019
-BM_BiquadFilterFloatOptimized/1/2/16           3986 ns         3973 ns       176177
-BM_BiquadFilterFloatOptimized/1/2/17           3985 ns         3973 ns       176194
-BM_BiquadFilterFloatOptimized/1/2/18           3984 ns         3973 ns       176174
-BM_BiquadFilterFloatOptimized/1/2/19           3984 ns         3973 ns       176167
-BM_BiquadFilterFloatOptimized/1/2/20           5012 ns         4999 ns       140029
-BM_BiquadFilterFloatOptimized/1/2/21           5014 ns         4999 ns       140026
-BM_BiquadFilterFloatOptimized/1/2/22           5013 ns         4999 ns       140013
-BM_BiquadFilterFloatOptimized/1/2/23           5014 ns         5000 ns       139998
-BM_BiquadFilterFloatOptimized/1/2/24           3986 ns         3973 ns       176163
-BM_BiquadFilterFloatOptimized/1/2/25           3984 ns         3973 ns       176201
-BM_BiquadFilterFloatOptimized/1/2/26           3983 ns         3973 ns       176186
-BM_BiquadFilterFloatOptimized/1/2/27           3986 ns         3973 ns       176174
-BM_BiquadFilterFloatOptimized/1/2/28           5013 ns         4999 ns       140001
-BM_BiquadFilterFloatOptimized/1/2/29           5014 ns         4999 ns       140033
-BM_BiquadFilterFloatOptimized/1/2/30           5012 ns         4999 ns       140018
-BM_BiquadFilterFloatOptimized/1/2/31           5014 ns         4999 ns       140003
-BM_BiquadFilterFloatOptimized/1/3/1            1010 ns         1007 ns       695126
-BM_BiquadFilterFloatOptimized/1/3/2            1753 ns         1748 ns       401120
-BM_BiquadFilterFloatOptimized/1/3/3            1765 ns         1759 ns       403787
-BM_BiquadFilterFloatOptimized/1/3/4            2312 ns         2307 ns       303354
-BM_BiquadFilterFloatOptimized/1/3/5            2317 ns         2309 ns       303095
-BM_BiquadFilterFloatOptimized/1/3/6            2318 ns         2311 ns       302366
-BM_BiquadFilterFloatOptimized/1/3/7            2315 ns         2309 ns       303183
-BM_BiquadFilterFloatOptimized/1/3/8            3053 ns         3044 ns       229914
-BM_BiquadFilterFloatOptimized/1/3/9            3053 ns         3044 ns       229952
-BM_BiquadFilterFloatOptimized/1/3/10           4346 ns         4333 ns       161527
-BM_BiquadFilterFloatOptimized/1/3/11           4345 ns         4333 ns       161578
-BM_BiquadFilterFloatOptimized/1/3/12           4938 ns         4924 ns       142144
-BM_BiquadFilterFloatOptimized/1/3/13           4938 ns         4924 ns       142160
-BM_BiquadFilterFloatOptimized/1/3/14           4938 ns         4924 ns       142173
-BM_BiquadFilterFloatOptimized/1/3/15           4938 ns         4924 ns       142171
-BM_BiquadFilterFloatOptimized/1/3/16           4072 ns         4058 ns       172551
-BM_BiquadFilterFloatOptimized/1/3/17           4071 ns         4059 ns       172535
-BM_BiquadFilterFloatOptimized/1/3/18           4071 ns         4059 ns       172451
-BM_BiquadFilterFloatOptimized/1/3/19           4072 ns         4059 ns       172440
-BM_BiquadFilterFloatOptimized/1/3/20           4938 ns         4925 ns       142159
-BM_BiquadFilterFloatOptimized/1/3/21           4940 ns         4924 ns       142162
-BM_BiquadFilterFloatOptimized/1/3/22           4938 ns         4924 ns       142152
-BM_BiquadFilterFloatOptimized/1/3/23           4939 ns         4924 ns       142166
-BM_BiquadFilterFloatOptimized/1/3/24           4070 ns         4058 ns       172556
-BM_BiquadFilterFloatOptimized/1/3/25           4069 ns         4058 ns       172463
-BM_BiquadFilterFloatOptimized/1/3/26           4071 ns         4058 ns       172489
-BM_BiquadFilterFloatOptimized/1/3/27           4070 ns         4058 ns       172506
-BM_BiquadFilterFloatOptimized/1/3/28           4938 ns         4924 ns       142152
-BM_BiquadFilterFloatOptimized/1/3/29           4939 ns         4924 ns       142164
-BM_BiquadFilterFloatOptimized/1/3/30           4937 ns         4924 ns       142172
-BM_BiquadFilterFloatOptimized/1/3/31           4939 ns         4924 ns       142156
-BM_BiquadFilterFloatOptimized/1/4/1             555 ns          553 ns      1264784
-BM_BiquadFilterFloatOptimized/1/4/2             736 ns          734 ns       953628
-BM_BiquadFilterFloatOptimized/1/4/3             736 ns          734 ns       953966
-BM_BiquadFilterFloatOptimized/1/4/4            1357 ns         1353 ns       517294
-BM_BiquadFilterFloatOptimized/1/4/5            1357 ns         1353 ns       517252
-BM_BiquadFilterFloatOptimized/1/4/6            1357 ns         1353 ns       517358
-BM_BiquadFilterFloatOptimized/1/4/7            1357 ns         1353 ns       517367
-BM_BiquadFilterFloatOptimized/1/4/8            1902 ns         1896 ns       369039
-BM_BiquadFilterFloatOptimized/1/4/9            1903 ns         1897 ns       368999
-BM_BiquadFilterFloatOptimized/1/4/10           3984 ns         3972 ns       176223
-BM_BiquadFilterFloatOptimized/1/4/11           3985 ns         3972 ns       176227
-BM_BiquadFilterFloatOptimized/1/4/12           4996 ns         4982 ns       140498
-BM_BiquadFilterFloatOptimized/1/4/13           4996 ns         4982 ns       140514
-BM_BiquadFilterFloatOptimized/1/4/14           4995 ns         4982 ns       140497
-BM_BiquadFilterFloatOptimized/1/4/15           4995 ns         4982 ns       140514
-BM_BiquadFilterFloatOptimized/1/4/16           3984 ns         3973 ns       176199
-BM_BiquadFilterFloatOptimized/1/4/17           3984 ns         3973 ns       176183
-BM_BiquadFilterFloatOptimized/1/4/18           3985 ns         3973 ns       176198
-BM_BiquadFilterFloatOptimized/1/4/19           3986 ns         3973 ns       176194
-BM_BiquadFilterFloatOptimized/1/4/20           4998 ns         4984 ns       140422
-BM_BiquadFilterFloatOptimized/1/4/21           4997 ns         4982 ns       140519
-BM_BiquadFilterFloatOptimized/1/4/22           4995 ns         4982 ns       140514
-BM_BiquadFilterFloatOptimized/1/4/23           4996 ns         4982 ns       140516
-BM_BiquadFilterFloatOptimized/1/4/24           3984 ns         3973 ns       176184
-BM_BiquadFilterFloatOptimized/1/4/25           3983 ns         3972 ns       176191
-BM_BiquadFilterFloatOptimized/1/4/26           3985 ns         3973 ns       176189
-BM_BiquadFilterFloatOptimized/1/4/27           3985 ns         3973 ns       176195
-BM_BiquadFilterFloatOptimized/1/4/28           4996 ns         4982 ns       140504
-BM_BiquadFilterFloatOptimized/1/4/29           4996 ns         4982 ns       140513
-BM_BiquadFilterFloatOptimized/1/4/30           4995 ns         4982 ns       140510
-BM_BiquadFilterFloatOptimized/1/4/31           4997 ns         4982 ns       140504
-BM_BiquadFilterFloatNonOptimized/0/1/31        2178 ns         2172 ns       322337
-BM_BiquadFilterFloatNonOptimized/0/2/31        4353 ns         4342 ns       161208
-BM_BiquadFilterFloatNonOptimized/0/3/31        6529 ns         6509 ns       107546
-BM_BiquadFilterFloatNonOptimized/0/4/31        8700 ns         8677 ns        80685
-BM_BiquadFilterFloatNonOptimized/0/5/31       10874 ns        10844 ns        64535
-BM_BiquadFilterFloatNonOptimized/0/6/31       13072 ns        13030 ns        53723
-BM_BiquadFilterFloatNonOptimized/0/7/31       15226 ns        15184 ns        46111
-BM_BiquadFilterFloatNonOptimized/0/8/31       17416 ns        17371 ns        40292
-BM_BiquadFilterFloatNonOptimized/0/9/31       19595 ns        19545 ns        35814
-BM_BiquadFilterFloatNonOptimized/0/10/31      21774 ns        21713 ns        32242
-BM_BiquadFilterFloatNonOptimized/0/11/31      23971 ns        23908 ns        29279
-BM_BiquadFilterFloatNonOptimized/0/12/31      26170 ns        26092 ns        26825
-BM_BiquadFilterFloatNonOptimized/0/13/31      28384 ns        28304 ns        24732
-BM_BiquadFilterFloatNonOptimized/0/14/31      30585 ns        30495 ns        22956
-BM_BiquadFilterFloatNonOptimized/0/15/31      32811 ns        32724 ns        21391
-BM_BiquadFilterFloatNonOptimized/0/16/31      35082 ns        34987 ns        20007
-BM_BiquadFilterFloatNonOptimized/0/17/31      37629 ns        37527 ns        18653
-BM_BiquadFilterFloatNonOptimized/0/18/31      40442 ns        40328 ns        17366
-BM_BiquadFilterFloatNonOptimized/0/19/31      42448 ns        42335 ns        16532
-BM_BiquadFilterFloatNonOptimized/0/20/31      45171 ns        45045 ns        15536
-BM_BiquadFilterFloatNonOptimized/0/21/31      46966 ns        46835 ns        14950
-BM_BiquadFilterFloatNonOptimized/0/22/31      48604 ns        48466 ns        14449
-BM_BiquadFilterFloatNonOptimized/0/23/31      50446 ns        50294 ns        13915
-BM_BiquadFilterFloatNonOptimized/0/24/31      52667 ns        52495 ns        13339
-BM_BiquadFilterDoubleOptimized/0/1/31          2180 ns         2173 ns       322151
-BM_BiquadFilterDoubleOptimized/0/2/31          5002 ns         4987 ns       140369
-BM_BiquadFilterDoubleOptimized/0/3/31          4919 ns         4906 ns       142292
-BM_BiquadFilterDoubleOptimized/0/4/31          5225 ns         5210 ns       134286
-BM_BiquadFilterDoubleNonOptimized/0/1/31       2177 ns         2171 ns       322374
-BM_BiquadFilterDoubleNonOptimized/0/2/31       4353 ns         4341 ns       161217
-BM_BiquadFilterDoubleNonOptimized/0/3/31       6537 ns         6516 ns       107442
-BM_BiquadFilterDoubleNonOptimized/0/4/31       8715 ns         8691 ns        80545
+BM_BiquadFilter1D/0                             558 ns          556 ns      1258922
+BM_BiquadFilter1D/1                             561 ns          560 ns      1251090
+BM_BiquadFilterFloatOptimized/0/1/31           2499 ns         2493 ns       280808
+BM_BiquadFilterFloatOptimized/0/2/31           3174 ns         3166 ns       221128
+BM_BiquadFilterFloatOptimized/0/3/31           3497 ns         3487 ns       200739
+BM_BiquadFilterFloatOptimized/0/4/31           3165 ns         3157 ns       221768
+BM_BiquadFilterFloatOptimized/0/5/31           3424 ns         3415 ns       204909
+BM_BiquadFilterFloatOptimized/0/6/31           3539 ns         3530 ns       198271
+BM_BiquadFilterFloatOptimized/0/7/31           4311 ns         4300 ns       162593
+BM_BiquadFilterFloatOptimized/0/8/31           3501 ns         3492 ns       200490
+BM_BiquadFilterFloatOptimized/0/9/31           4310 ns         4299 ns       162317
+BM_BiquadFilterFloatOptimized/0/10/31          4487 ns         4476 ns       156406
+BM_BiquadFilterFloatOptimized/0/11/31          5589 ns         5575 ns       125644
+BM_BiquadFilterFloatOptimized/0/12/31          4457 ns         4445 ns       157532
+BM_BiquadFilterFloatOptimized/0/13/31          5600 ns         5586 ns       125403
+BM_BiquadFilterFloatOptimized/0/14/31          5834 ns         5819 ns       120309
+BM_BiquadFilterFloatOptimized/0/15/31          7089 ns         7070 ns        98986
+BM_BiquadFilterFloatOptimized/0/16/31          5644 ns         5627 ns       124364
+BM_BiquadFilterFloatOptimized/0/17/31          8244 ns         8223 ns        85126
+BM_BiquadFilterFloatOptimized/0/18/31          8900 ns         8874 ns        78853
+BM_BiquadFilterFloatOptimized/0/19/31          9385 ns         9360 ns        74775
+BM_BiquadFilterFloatOptimized/0/20/31          8783 ns         8760 ns        79901
+BM_BiquadFilterFloatOptimized/0/21/31          9335 ns         9305 ns        75239
+BM_BiquadFilterFloatOptimized/0/22/31          9561 ns         9535 ns        73368
+BM_BiquadFilterFloatOptimized/0/23/31         10334 ns        10307 ns        67876
+BM_BiquadFilterFloatOptimized/0/24/31          9266 ns         9241 ns        75692
+BM_BiquadFilterFloatOptimized/0/1/1             557 ns          556 ns      1259656
+BM_BiquadFilterFloatOptimized/0/1/2             651 ns          649 ns      1078575
+BM_BiquadFilterFloatOptimized/0/1/3             650 ns          648 ns      1079479
+BM_BiquadFilterFloatOptimized/0/1/4             805 ns          803 ns       918780
+BM_BiquadFilterFloatOptimized/0/1/5             984 ns          981 ns       736887
+BM_BiquadFilterFloatOptimized/0/1/6             797 ns          795 ns       882135
+BM_BiquadFilterFloatOptimized/0/1/7             792 ns          790 ns       897376
+BM_BiquadFilterFloatOptimized/0/1/8            1974 ns         1969 ns       355501
+BM_BiquadFilterFloatOptimized/0/1/9            1973 ns         1968 ns       355606
+BM_BiquadFilterFloatOptimized/0/1/10           2709 ns         2703 ns       259268
+BM_BiquadFilterFloatOptimized/0/1/11           2613 ns         2607 ns       268435
+BM_BiquadFilterFloatOptimized/0/1/12           2499 ns         2493 ns       280813
+BM_BiquadFilterFloatOptimized/0/1/13           2497 ns         2491 ns       280990
+BM_BiquadFilterFloatOptimized/0/1/14           2499 ns         2493 ns       280818
+BM_BiquadFilterFloatOptimized/0/1/15           2499 ns         2493 ns       280815
+BM_BiquadFilterFloatOptimized/0/1/16           2327 ns         2321 ns       301566
+BM_BiquadFilterFloatOptimized/0/1/17           2326 ns         2321 ns       301606
+BM_BiquadFilterFloatOptimized/0/1/18           2326 ns         2321 ns       301606
+BM_BiquadFilterFloatOptimized/0/1/19           2327 ns         2321 ns       301606
+BM_BiquadFilterFloatOptimized/0/1/20           2499 ns         2493 ns       280810
+BM_BiquadFilterFloatOptimized/0/1/21           2497 ns         2491 ns       280989
+BM_BiquadFilterFloatOptimized/0/1/22           2499 ns         2493 ns       280796
+BM_BiquadFilterFloatOptimized/0/1/23           2499 ns         2493 ns       280807
+BM_BiquadFilterFloatOptimized/0/1/24           2327 ns         2321 ns       301596
+BM_BiquadFilterFloatOptimized/0/1/25           2327 ns         2321 ns       301600
+BM_BiquadFilterFloatOptimized/0/1/26           2327 ns         2321 ns       301597
+BM_BiquadFilterFloatOptimized/0/1/27           2327 ns         2321 ns       301588
+BM_BiquadFilterFloatOptimized/0/1/28           2500 ns         2493 ns       280761
+BM_BiquadFilterFloatOptimized/0/1/29           2499 ns         2492 ns       280951
+BM_BiquadFilterFloatOptimized/0/1/30           2500 ns         2493 ns       280787
+BM_BiquadFilterFloatOptimized/0/1/31           2500 ns         2493 ns       280808
+BM_BiquadFilterFloatOptimized/0/2/1             440 ns          439 ns      1595281
+BM_BiquadFilterFloatOptimized/0/2/2             633 ns          631 ns      1108368
+BM_BiquadFilterFloatOptimized/0/2/3             633 ns          631 ns      1108778
+BM_BiquadFilterFloatOptimized/0/2/4            1523 ns         1518 ns       461120
+BM_BiquadFilterFloatOptimized/0/2/5            1523 ns         1518 ns       461075
+BM_BiquadFilterFloatOptimized/0/2/6            1522 ns         1518 ns       461059
+BM_BiquadFilterFloatOptimized/0/2/7            1523 ns         1518 ns       461068
+BM_BiquadFilterFloatOptimized/0/2/8            2854 ns         2845 ns       248471
+BM_BiquadFilterFloatOptimized/0/2/9            2809 ns         2800 ns       250019
+BM_BiquadFilterFloatOptimized/0/2/10           4412 ns         4398 ns       159164
+BM_BiquadFilterFloatOptimized/0/2/11           4413 ns         4399 ns       159138
+BM_BiquadFilterFloatOptimized/0/2/12           3177 ns         3167 ns       221023
+BM_BiquadFilterFloatOptimized/0/2/13           3164 ns         3154 ns       221972
+BM_BiquadFilterFloatOptimized/0/2/14           3225 ns         3211 ns       217654
+BM_BiquadFilterFloatOptimized/0/2/15           3178 ns         3167 ns       221055
+BM_BiquadFilterFloatOptimized/0/2/16           3726 ns         3714 ns       188557
+BM_BiquadFilterFloatOptimized/0/2/17           3726 ns         3716 ns       188151
+BM_BiquadFilterFloatOptimized/0/2/18           3734 ns         3721 ns       188243
+BM_BiquadFilterFloatOptimized/0/2/19           3723 ns         3710 ns       188560
+BM_BiquadFilterFloatOptimized/0/2/20           3178 ns         3167 ns       221083
+BM_BiquadFilterFloatOptimized/0/2/21           3163 ns         3154 ns       221947
+BM_BiquadFilterFloatOptimized/0/2/22           3224 ns         3214 ns       218373
+BM_BiquadFilterFloatOptimized/0/2/23           3177 ns         3167 ns       221028
+BM_BiquadFilterFloatOptimized/0/2/24           3727 ns         3714 ns       188443
+BM_BiquadFilterFloatOptimized/0/2/25           3735 ns         3721 ns       188131
+BM_BiquadFilterFloatOptimized/0/2/26           3732 ns         3719 ns       188374
+BM_BiquadFilterFloatOptimized/0/2/27           3721 ns         3710 ns       188619
+BM_BiquadFilterFloatOptimized/0/2/28           3176 ns         3167 ns       221067
+BM_BiquadFilterFloatOptimized/0/2/29           3164 ns         3154 ns       221953
+BM_BiquadFilterFloatOptimized/0/2/30           3225 ns         3214 ns       217988
+BM_BiquadFilterFloatOptimized/0/2/31           3176 ns         3167 ns       221015
+BM_BiquadFilterFloatOptimized/0/3/1             877 ns          874 ns       800012
+BM_BiquadFilterFloatOptimized/0/3/2            1218 ns         1214 ns       576381
+BM_BiquadFilterFloatOptimized/0/3/3            1217 ns         1214 ns       577767
+BM_BiquadFilterFloatOptimized/0/3/4            2281 ns         2274 ns       307760
+BM_BiquadFilterFloatOptimized/0/3/5            2285 ns         2278 ns       307313
+BM_BiquadFilterFloatOptimized/0/3/6            2285 ns         2278 ns       307254
+BM_BiquadFilterFloatOptimized/0/3/7            2280 ns         2273 ns       307865
+BM_BiquadFilterFloatOptimized/0/3/8            2966 ns         2957 ns       236544
+BM_BiquadFilterFloatOptimized/0/3/9            2945 ns         2936 ns       238459
+BM_BiquadFilterFloatOptimized/0/3/10           4613 ns         4597 ns       152280
+BM_BiquadFilterFloatOptimized/0/3/11           4612 ns         4597 ns       152296
+BM_BiquadFilterFloatOptimized/0/3/12           3499 ns         3489 ns       200637
+BM_BiquadFilterFloatOptimized/0/3/13           3498 ns         3486 ns       200771
+BM_BiquadFilterFloatOptimized/0/3/14           3569 ns         3557 ns       196782
+BM_BiquadFilterFloatOptimized/0/3/15           3500 ns         3489 ns       200662
+BM_BiquadFilterFloatOptimized/0/3/16           3809 ns         3797 ns       184356
+BM_BiquadFilterFloatOptimized/0/3/17           3817 ns         3804 ns       184009
+BM_BiquadFilterFloatOptimized/0/3/18           3818 ns         3804 ns       183988
+BM_BiquadFilterFloatOptimized/0/3/19           3809 ns         3797 ns       184373
+BM_BiquadFilterFloatOptimized/0/3/20           3501 ns         3489 ns       200657
+BM_BiquadFilterFloatOptimized/0/3/21           3497 ns         3486 ns       200769
+BM_BiquadFilterFloatOptimized/0/3/22           3567 ns         3556 ns       196867
+BM_BiquadFilterFloatOptimized/0/3/23           3500 ns         3489 ns       200647
+BM_BiquadFilterFloatOptimized/0/3/24           3808 ns         3796 ns       184354
+BM_BiquadFilterFloatOptimized/0/3/25           3816 ns         3805 ns       184002
+BM_BiquadFilterFloatOptimized/0/3/26           3816 ns         3804 ns       184006
+BM_BiquadFilterFloatOptimized/0/3/27           3809 ns         3797 ns       184416
+BM_BiquadFilterFloatOptimized/0/3/28           3500 ns         3488 ns       200657
+BM_BiquadFilterFloatOptimized/0/3/29           3498 ns         3486 ns       200786
+BM_BiquadFilterFloatOptimized/0/3/30           3568 ns         3557 ns       196887
+BM_BiquadFilterFloatOptimized/0/3/31           3500 ns         3488 ns       200663
+BM_BiquadFilterFloatOptimized/0/4/1             558 ns          556 ns      1257930
+BM_BiquadFilterFloatOptimized/0/4/2             652 ns          650 ns      1076427
+BM_BiquadFilterFloatOptimized/0/4/3             651 ns          648 ns      1079429
+BM_BiquadFilterFloatOptimized/0/4/4             831 ns          829 ns       844257
+BM_BiquadFilterFloatOptimized/0/4/5             829 ns          826 ns       847191
+BM_BiquadFilterFloatOptimized/0/4/6             829 ns          826 ns       847010
+BM_BiquadFilterFloatOptimized/0/4/7             832 ns          829 ns       843914
+BM_BiquadFilterFloatOptimized/0/4/8            1881 ns         1875 ns       373166
+BM_BiquadFilterFloatOptimized/0/4/9            1910 ns         1904 ns       367626
+BM_BiquadFilterFloatOptimized/0/4/10           2247 ns         2239 ns       312581
+BM_BiquadFilterFloatOptimized/0/4/11           2246 ns         2238 ns       312874
+BM_BiquadFilterFloatOptimized/0/4/12           3170 ns         3158 ns       221666
+BM_BiquadFilterFloatOptimized/0/4/13           3159 ns         3150 ns       222273
+BM_BiquadFilterFloatOptimized/0/4/14           3149 ns         3139 ns       222959
+BM_BiquadFilterFloatOptimized/0/4/15           3168 ns         3158 ns       221668
+BM_BiquadFilterFloatOptimized/0/4/16           2278 ns         2271 ns       308250
+BM_BiquadFilterFloatOptimized/0/4/17           2280 ns         2273 ns       308036
+BM_BiquadFilterFloatOptimized/0/4/18           2280 ns         2273 ns       308016
+BM_BiquadFilterFloatOptimized/0/4/19           2278 ns         2271 ns       308301
+BM_BiquadFilterFloatOptimized/0/4/20           3168 ns         3158 ns       221671
+BM_BiquadFilterFloatOptimized/0/4/21           3159 ns         3150 ns       222270
+BM_BiquadFilterFloatOptimized/0/4/22           3149 ns         3139 ns       223010
+BM_BiquadFilterFloatOptimized/0/4/23           3168 ns         3158 ns       221652
+BM_BiquadFilterFloatOptimized/0/4/24           2279 ns         2271 ns       308191
+BM_BiquadFilterFloatOptimized/0/4/25           2281 ns         2273 ns       307942
+BM_BiquadFilterFloatOptimized/0/4/26           2280 ns         2272 ns       308012
+BM_BiquadFilterFloatOptimized/0/4/27           2279 ns         2271 ns       308357
+BM_BiquadFilterFloatOptimized/0/4/28           3169 ns         3158 ns       221700
+BM_BiquadFilterFloatOptimized/0/4/29           3159 ns         3149 ns       222286
+BM_BiquadFilterFloatOptimized/0/4/30           3149 ns         3139 ns       222997
+BM_BiquadFilterFloatOptimized/0/4/31           3168 ns         3158 ns       221672
+BM_BiquadFilterFloatOptimized/1/1/1             558 ns          556 ns      1259230
+BM_BiquadFilterFloatOptimized/1/1/2             651 ns          649 ns      1078239
+BM_BiquadFilterFloatOptimized/1/1/3             651 ns          649 ns      1078731
+BM_BiquadFilterFloatOptimized/1/1/4             771 ns          768 ns       898703
+BM_BiquadFilterFloatOptimized/1/1/5            1020 ns         1017 ns       712070
+BM_BiquadFilterFloatOptimized/1/1/6             796 ns          794 ns       867607
+BM_BiquadFilterFloatOptimized/1/1/7             816 ns          814 ns       895946
+BM_BiquadFilterFloatOptimized/1/1/8            1976 ns         1970 ns       355331
+BM_BiquadFilterFloatOptimized/1/1/9            1976 ns         1969 ns       355435
+BM_BiquadFilterFloatOptimized/1/1/10           2709 ns         2700 ns       259919
+BM_BiquadFilterFloatOptimized/1/1/11           2617 ns         2608 ns       268279
+BM_BiquadFilterFloatOptimized/1/1/12           2501 ns         2494 ns       280784
+BM_BiquadFilterFloatOptimized/1/1/13           2500 ns         2492 ns       280890
+BM_BiquadFilterFloatOptimized/1/1/14           2502 ns         2493 ns       280685
+BM_BiquadFilterFloatOptimized/1/1/15           2502 ns         2493 ns       280729
+BM_BiquadFilterFloatOptimized/1/1/16           2329 ns         2322 ns       301460
+BM_BiquadFilterFloatOptimized/1/1/17           2330 ns         2322 ns       301456
+BM_BiquadFilterFloatOptimized/1/1/18           2329 ns         2322 ns       301447
+BM_BiquadFilterFloatOptimized/1/1/19           2329 ns         2322 ns       301456
+BM_BiquadFilterFloatOptimized/1/1/20           2502 ns         2494 ns       280714
+BM_BiquadFilterFloatOptimized/1/1/21           2501 ns         2492 ns       280834
+BM_BiquadFilterFloatOptimized/1/1/22           2502 ns         2494 ns       280713
+BM_BiquadFilterFloatOptimized/1/1/23           2502 ns         2494 ns       280691
+BM_BiquadFilterFloatOptimized/1/1/24           2329 ns         2322 ns       301435
+BM_BiquadFilterFloatOptimized/1/1/25           2330 ns         2322 ns       301438
+BM_BiquadFilterFloatOptimized/1/1/26           2329 ns         2322 ns       301470
+BM_BiquadFilterFloatOptimized/1/1/27           2330 ns         2322 ns       301493
+BM_BiquadFilterFloatOptimized/1/1/28           2502 ns         2493 ns       280702
+BM_BiquadFilterFloatOptimized/1/1/29           2500 ns         2492 ns       280940
+BM_BiquadFilterFloatOptimized/1/1/30           2502 ns         2494 ns       280740
+BM_BiquadFilterFloatOptimized/1/1/31           2502 ns         2494 ns       280719
+BM_BiquadFilterFloatOptimized/1/2/1             440 ns          439 ns      1595119
+BM_BiquadFilterFloatOptimized/1/2/2             634 ns          631 ns      1109077
+BM_BiquadFilterFloatOptimized/1/2/3             633 ns          631 ns      1108421
+BM_BiquadFilterFloatOptimized/1/2/4            1523 ns         1518 ns       460928
+BM_BiquadFilterFloatOptimized/1/2/5            1524 ns         1518 ns       461034
+BM_BiquadFilterFloatOptimized/1/2/6            1524 ns         1518 ns       460936
+BM_BiquadFilterFloatOptimized/1/2/7            1524 ns         1519 ns       460956
+BM_BiquadFilterFloatOptimized/1/2/8            2871 ns         2862 ns       243633
+BM_BiquadFilterFloatOptimized/1/2/9            2808 ns         2800 ns       249997
+BM_BiquadFilterFloatOptimized/1/2/10           4412 ns         4397 ns       159195
+BM_BiquadFilterFloatOptimized/1/2/11           4412 ns         4398 ns       159154
+BM_BiquadFilterFloatOptimized/1/2/12           3177 ns         3167 ns       221084
+BM_BiquadFilterFloatOptimized/1/2/13           3164 ns         3154 ns       221939
+BM_BiquadFilterFloatOptimized/1/2/14           3217 ns         3210 ns       218007
+BM_BiquadFilterFloatOptimized/1/2/15           3177 ns         3167 ns       221047
+BM_BiquadFilterFloatOptimized/1/2/16           3726 ns         3713 ns       188559
+BM_BiquadFilterFloatOptimized/1/2/17           3733 ns         3720 ns       188289
+BM_BiquadFilterFloatOptimized/1/2/18           3733 ns         3721 ns       188122
+BM_BiquadFilterFloatOptimized/1/2/19           3724 ns         3712 ns       188522
+BM_BiquadFilterFloatOptimized/1/2/20           3177 ns         3167 ns       221061
+BM_BiquadFilterFloatOptimized/1/2/21           3164 ns         3154 ns       221952
+BM_BiquadFilterFloatOptimized/1/2/22           3224 ns         3213 ns       217980
+BM_BiquadFilterFloatOptimized/1/2/23           3178 ns         3167 ns       221046
+BM_BiquadFilterFloatOptimized/1/2/24           3726 ns         3714 ns       188525
+BM_BiquadFilterFloatOptimized/1/2/25           3732 ns         3720 ns       188234
+BM_BiquadFilterFloatOptimized/1/2/26           3732 ns         3719 ns       188156
+BM_BiquadFilterFloatOptimized/1/2/27           3726 ns         3714 ns       188613
+BM_BiquadFilterFloatOptimized/1/2/28           3177 ns         3167 ns       221042
+BM_BiquadFilterFloatOptimized/1/2/29           3164 ns         3154 ns       221970
+BM_BiquadFilterFloatOptimized/1/2/30           3226 ns         3215 ns       217798
+BM_BiquadFilterFloatOptimized/1/2/31           3178 ns         3167 ns       221042
+BM_BiquadFilterFloatOptimized/1/3/1             885 ns          882 ns       795133
+BM_BiquadFilterFloatOptimized/1/3/2            1219 ns         1214 ns       576293
+BM_BiquadFilterFloatOptimized/1/3/3            1218 ns         1214 ns       576722
+BM_BiquadFilterFloatOptimized/1/3/4            2282 ns         2274 ns       307745
+BM_BiquadFilterFloatOptimized/1/3/5            2286 ns         2278 ns       307324
+BM_BiquadFilterFloatOptimized/1/3/6            2286 ns         2278 ns       307308
+BM_BiquadFilterFloatOptimized/1/3/7            2282 ns         2274 ns       307912
+BM_BiquadFilterFloatOptimized/1/3/8            2962 ns         2952 ns       237180
+BM_BiquadFilterFloatOptimized/1/3/9            2946 ns         2935 ns       238462
+BM_BiquadFilterFloatOptimized/1/3/10           4612 ns         4597 ns       152246
+BM_BiquadFilterFloatOptimized/1/3/11           4613 ns         4596 ns       152286
+BM_BiquadFilterFloatOptimized/1/3/12           3501 ns         3489 ns       200662
+BM_BiquadFilterFloatOptimized/1/3/13           3497 ns         3486 ns       200784
+BM_BiquadFilterFloatOptimized/1/3/14           3569 ns         3557 ns       196804
+BM_BiquadFilterFloatOptimized/1/3/15           3499 ns         3488 ns       200661
+BM_BiquadFilterFloatOptimized/1/3/16           3809 ns         3797 ns       184350
+BM_BiquadFilterFloatOptimized/1/3/17           3816 ns         3804 ns       184028
+BM_BiquadFilterFloatOptimized/1/3/18           3815 ns         3804 ns       184008
+BM_BiquadFilterFloatOptimized/1/3/19           3808 ns         3796 ns       184333
+BM_BiquadFilterFloatOptimized/1/3/20           3502 ns         3489 ns       200636
+BM_BiquadFilterFloatOptimized/1/3/21           3499 ns         3486 ns       200768
+BM_BiquadFilterFloatOptimized/1/3/22           3569 ns         3557 ns       196840
+BM_BiquadFilterFloatOptimized/1/3/23           3501 ns         3488 ns       200657
+BM_BiquadFilterFloatOptimized/1/3/24           3807 ns         3796 ns       184403
+BM_BiquadFilterFloatOptimized/1/3/25           3816 ns         3804 ns       184040
+BM_BiquadFilterFloatOptimized/1/3/26           3816 ns         3804 ns       184021
+BM_BiquadFilterFloatOptimized/1/3/27           3808 ns         3796 ns       184385
+BM_BiquadFilterFloatOptimized/1/3/28           3500 ns         3488 ns       200666
+BM_BiquadFilterFloatOptimized/1/3/29           3497 ns         3485 ns       200811
+BM_BiquadFilterFloatOptimized/1/3/30           3571 ns         3558 ns       196974
+BM_BiquadFilterFloatOptimized/1/3/31           3499 ns         3488 ns       200710
+BM_BiquadFilterFloatOptimized/1/4/1             558 ns          556 ns      1259007
+BM_BiquadFilterFloatOptimized/1/4/2             652 ns          650 ns      1076207
+BM_BiquadFilterFloatOptimized/1/4/3             650 ns          648 ns      1079464
+BM_BiquadFilterFloatOptimized/1/4/4             831 ns          828 ns       847251
+BM_BiquadFilterFloatOptimized/1/4/5             829 ns          826 ns       847543
+BM_BiquadFilterFloatOptimized/1/4/6             829 ns          826 ns       847037
+BM_BiquadFilterFloatOptimized/1/4/7             832 ns          829 ns       844307
+BM_BiquadFilterFloatOptimized/1/4/8            1879 ns         1873 ns       378908
+BM_BiquadFilterFloatOptimized/1/4/9            1910 ns         1905 ns       367554
+BM_BiquadFilterFloatOptimized/1/4/10           2246 ns         2240 ns       312471
+BM_BiquadFilterFloatOptimized/1/4/11           2244 ns         2238 ns       312719
+BM_BiquadFilterFloatOptimized/1/4/12           3167 ns         3157 ns       221689
+BM_BiquadFilterFloatOptimized/1/4/13           3159 ns         3149 ns       222292
+BM_BiquadFilterFloatOptimized/1/4/14           3148 ns         3138 ns       223041
+BM_BiquadFilterFloatOptimized/1/4/15           3167 ns         3157 ns       221705
+BM_BiquadFilterFloatOptimized/1/4/16           2278 ns         2271 ns       308275
+BM_BiquadFilterFloatOptimized/1/4/17           2280 ns         2273 ns       308050
+BM_BiquadFilterFloatOptimized/1/4/18           2280 ns         2272 ns       307994
+BM_BiquadFilterFloatOptimized/1/4/19           2278 ns         2270 ns       308324
+BM_BiquadFilterFloatOptimized/1/4/20           3168 ns         3157 ns       221734
+BM_BiquadFilterFloatOptimized/1/4/21           3159 ns         3149 ns       222273
+BM_BiquadFilterFloatOptimized/1/4/22           3148 ns         3139 ns       222991
+BM_BiquadFilterFloatOptimized/1/4/23           3166 ns         3157 ns       221723
+BM_BiquadFilterFloatOptimized/1/4/24           2278 ns         2271 ns       308395
+BM_BiquadFilterFloatOptimized/1/4/25           2279 ns         2272 ns       308055
+BM_BiquadFilterFloatOptimized/1/4/26           2280 ns         2272 ns       308098
+BM_BiquadFilterFloatOptimized/1/4/27           2278 ns         2271 ns       308274
+BM_BiquadFilterFloatOptimized/1/4/28           3168 ns         3157 ns       221710
+BM_BiquadFilterFloatOptimized/1/4/29           3158 ns         3149 ns       222311
+BM_BiquadFilterFloatOptimized/1/4/30           3148 ns         3138 ns       223009
+BM_BiquadFilterFloatOptimized/1/4/31           3167 ns         3157 ns       221723
+BM_BiquadFilterFloatNonOptimized/0/1/31        2500 ns         2493 ns       280839
+BM_BiquadFilterFloatNonOptimized/0/2/31        4996 ns         4983 ns       140491
+BM_BiquadFilterFloatNonOptimized/0/3/31        7491 ns         7468 ns        93734
+BM_BiquadFilterFloatNonOptimized/0/4/31        9988 ns         9955 ns        70314
+BM_BiquadFilterFloatNonOptimized/0/5/31       12475 ns        12440 ns        56266
+BM_BiquadFilterFloatNonOptimized/0/6/31       14977 ns        14927 ns        46888
+BM_BiquadFilterFloatNonOptimized/0/7/31       17540 ns        17486 ns        40039
+BM_BiquadFilterFloatNonOptimized/0/8/31       19997 ns        19937 ns        35114
+BM_BiquadFilterFloatNonOptimized/0/9/31       22510 ns        22444 ns        31185
+BM_BiquadFilterFloatNonOptimized/0/10/31      25029 ns        24949 ns        28059
+BM_BiquadFilterFloatNonOptimized/0/11/31      27520 ns        27436 ns        25514
+BM_BiquadFilterFloatNonOptimized/0/12/31      30048 ns        29959 ns        23368
+BM_BiquadFilterFloatNonOptimized/0/13/31      32524 ns        32428 ns        21586
+BM_BiquadFilterFloatNonOptimized/0/14/31      35051 ns        34949 ns        20029
+BM_BiquadFilterFloatNonOptimized/0/15/31      37546 ns        37436 ns        18697
+BM_BiquadFilterFloatNonOptimized/0/16/31      40115 ns        39978 ns        17510
+BM_BiquadFilterFloatNonOptimized/0/17/31      42624 ns        42492 ns        16473
+BM_BiquadFilterFloatNonOptimized/0/18/31      45142 ns        45008 ns        15550
+BM_BiquadFilterFloatNonOptimized/0/19/31      47667 ns        47508 ns        14732
+BM_BiquadFilterFloatNonOptimized/0/20/31      50150 ns        50005 ns        13999
+BM_BiquadFilterFloatNonOptimized/0/21/31      52661 ns        52492 ns        13336
+BM_BiquadFilterFloatNonOptimized/0/22/31      55160 ns        54977 ns        12732
+BM_BiquadFilterFloatNonOptimized/0/23/31      57717 ns        57556 ns        12194
+BM_BiquadFilterFloatNonOptimized/0/24/31      60105 ns        59986 ns        11684
+BM_BiquadFilterDoubleOptimized/0/1/31          2498 ns         2491 ns       281105
+BM_BiquadFilterDoubleOptimized/0/2/31          3123 ns         3112 ns       224898
+BM_BiquadFilterDoubleOptimized/0/3/31          3435 ns         3425 ns       204393
+BM_BiquadFilterDoubleOptimized/0/4/31          3567 ns         3556 ns       196854
+BM_BiquadFilterDoubleNonOptimized/0/1/31       2498 ns         2490 ns       281119
+BM_BiquadFilterDoubleNonOptimized/0/2/31       5019 ns         5004 ns       100000
+BM_BiquadFilterDoubleNonOptimized/0/3/31       7500 ns         7478 ns        93607
+BM_BiquadFilterDoubleNonOptimized/0/4/31      10010 ns         9981 ns        70129
 
  *******************************************************************/
 
diff --git a/audio_utils/include/audio_utils/BiquadFilter.h b/audio_utils/include/audio_utils/BiquadFilter.h
index c2f481b..7a10339 100644
--- a/audio_utils/include/audio_utils/BiquadFilter.h
+++ b/audio_utils/include/audio_utils/BiquadFilter.h
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#ifndef ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
-#define ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
+#pragma once
 
 #include "intrinsic_utils.h"
 
@@ -36,12 +35,146 @@
 #define USE_NEON
 #endif
 
+// Use dither to prevent subnormals for CPUs that raise an exception (x86 and x86-64).
+#pragma push_macro("USE_DITHER")
+#undef USE_DITHER
+// Note: the GCC/Clang predefined macro for x86-64 is __x86_64__.
+#if defined(__i386__) || defined(__x86_64__)
+#define USE_DITHER
+#endif
+
 namespace android::audio_utils {
 
 static constexpr size_t kBiquadNumCoefs  = 5;
 static constexpr size_t kBiquadNumDelays = 2;
 
+/**
+ * BiquadDirect2Transpose is a low overhead biquad filter with coefficients
+ * b0, b1, b2, a1, a2 and two delay states, evaluated one frame at a time.
+ *
+ * This can be used by itself, but it is preferred for best data management
+ * to use the BiquadFilter abstraction below.
+ *
+ * T is the data type (scalar or vector) used for the samples and the delay states.
+ * F is the filter coefficient type.  It is either a scalar or vector (matching T).
+ */
+template <typename T, typename F>
+struct BiquadDirect2Transpose {
+    F coef_[5]; // b0, b1, b2, -a1, -a2: stored with the denominator a's negated.
+    T s1_; // delay state 1, carried across process() calls
+    T s2_; // delay state 2, carried across process() calls
+
+    // These are the coefficient occupancies we optimize for, expressed as a bitmask of
+    // the non-zero coefficients: bit 0 = b0, bit 1 = b1, bit 2 = b2, bit 3 = a1, bit 4 = a2.
+    static inline constexpr size_t required_occupancies_[] = {
+        0x1,  // (1) constant scale
+        0x3,  // (3) single zero
+        0x7,  // (7) double zero
+        0x9,  // (9) single pole
+        0xb,  // (11) first order IIR
+        0x1b, // (27) double pole + single zero
+        0x1f, // (31) second order IIR (full Biquad)
+    };
+
+    // Take care with the order of arguments - it starts with the b's, then goes to the a's.
+    // The a's are "positive" reference (some filters take negative); they are negated below.
+    BiquadDirect2Transpose(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
+            const T& s1 = {}, const T& s2 = {})
+        // i.e. coef_{b0, b1, b2, -a1, -a2}, with the negation done through intrinsics::vneg.
+        : coef_{ b0,
+            b1,
+            b2,
+            intrinsics::vneg(a1),
+            intrinsics::vneg(a2) }
+        , s1_{s1}
+        , s2_{s2} {
+    }
+
+    // D is the buffer element type.  It must be the same element type of T or F.
+    // Take care with the order: output comes first, then input.  stride is counted in D's.
+    template<typename D, size_t OCCUPANCY = 0x1f> // OCCUPANCY: see required_occupancies_
+    __attribute__((always_inline)) // required for 1ch speedup (30% faster)
+    void process(D* output, const D* input, size_t frames, size_t stride) {
+        using namespace intrinsics;
+        // For SSE it is possible to vdup F to T if F is scalar.
+        const F b0 = coef_[0];         // b0
+        const F b1 = coef_[1];         // b1
+        const F b2 = coef_[2];         // b2
+        const F negativeA1 = coef_[3]; // -a1
+        const F negativeA2 = coef_[4]; // -a2
+        T s1 = s1_; // work on local copies of the delay states; written back at the end
+        T s2 = s2_;
+        T xn, yn; // OK to declare temps outside loop rather than at the point of initialization.
+#ifdef USE_DITHER
+        constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
+        T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
+#endif
+
+        // Unroll control.  Make sure the constexpr remains constexpr :-).
+        constexpr size_t CHANNELS = sizeof(T) / sizeof(D); // number of D elements held in one T
+        constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 2;   // below this won't be unrolled.
+        constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16;  // above this won't be unrolled.
+        constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
+                CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
+        size_t remainder = 0; // frames left over once frames is split into unrolled groups
+        if constexpr (UNROLL_LOOPS > 1) {
+            remainder = frames % UNROLL_LOOPS;
+            frames /= UNROLL_LOOPS;
+        }
+
+        // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
+        // The other alternative is to use a MACRO, but that doesn't read as well.
+        const auto KERNEL = [&]() __attribute__((always_inline)) {
+            xn = vld1<T>(input);
+            input += stride;
+#ifdef USE_DITHER
+            xn = vadd(xn, dither);
+            dither = vneg(dither); // alternate the sign of the dither on every frame
+#endif
+
+            yn = s1; // the output starts from delay state 1; the b0 term is added if occupied
+            if constexpr (OCCUPANCY >> 0 & 1) {
+                yn = vmla(yn, b0, xn);
+            }
+            vst1(output, yn);
+            output += stride;
+
+            s1 = s2; // the next s1 starts from s2, then takes the a1 and b1 terms if occupied
+            if constexpr (OCCUPANCY >> 3 & 1) {
+                s1 = vmla(s1, negativeA1, yn);
+            }
+            if constexpr (OCCUPANCY >> 1 & 1) {
+                s1 = vmla(s1, b1, xn);
+            }
+            if constexpr (OCCUPANCY >> 2 & 1) {
+                s2 = vmul(b2, xn);
+            } else {
+                s2 = vdupn<T>(0.f); // no b2 term: restart s2 from zero
+            }
+            if constexpr (OCCUPANCY >> 4 & 1) {
+                s2 = vmla(s2, negativeA2, yn);
+            }
+        };
+
+        while (frames > 0) { // frames now counts groups of UNROLL_LOOPS frames
+            #pragma unroll
+            for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
+                KERNEL();
+            }
+            frames--;
+        }
+        if constexpr (UNROLL_LOOPS > 1) { // finish the frames that did not fill a whole group
+            for (size_t i = 0; i < remainder; ++i) {
+                KERNEL();
+            }
+        }
+        s1_ = s1; // save the delay states for the next call
+        s2_ = s2;
+    }
+};
+
 namespace details {
+
 // Helper methods for constructing a constexpr array of function pointers.
 // As function pointers are efficient and have no constructor/destructor
 // this is preferred over std::function.
@@ -105,67 +238,6 @@
     }
 }
 
-// For biquad_filter_fast, we template based on whether coef[i] is nonzero - this should be
-// determined in a constexpr fashion for optimization.
-
-// Helper which takes a stride to allow column processing of interleaved audio streams.
-template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
-void biquad_filter_1fast(D *out, const D *in, size_t frames, size_t stride,
-                         size_t channelCount, D *delays, const D *coefs, size_t localStride) {
-#if defined(__i386__) || defined(__x86_x64__)
-    D delta = std::numeric_limits<float>::min() * (1 << 24);
-#endif
-    D b0, b1, b2, negativeA1, negativeA2;
-
-    if constexpr (SAME_COEF_PER_CHANNEL) {
-        b0 = coefs[0];
-        b1 = coefs[1];
-        b2 = coefs[2];
-        negativeA1 = -coefs[3];
-        negativeA2 = -coefs[4];
-    }
-    for (size_t i = 0; i < channelCount; ++i) {
-        if constexpr (!SAME_COEF_PER_CHANNEL) {
-            b0 = coefs[0];
-            b1 = coefs[localStride];
-            b2 = coefs[2 * localStride];
-            negativeA1 = -coefs[3 * localStride];
-            negativeA2 = -coefs[4 * localStride];
-            ++coefs;
-        }
-
-        D s1n1 = delays[0];
-        D s2n1 = delays[localStride];
-        const D *input = &in[i];
-        D *output = &out[i];
-        for (size_t j = frames; j > 0; --j) {
-            // Adding a delta to avoid subnormal exception handling on the x86/x64 platform;
-            // this is not a problem with the ARM platform. The delta will not affect the
-            // precision of the result.
-#if defined(__i386__) || defined(__x86_x64__)
-            const D xn = *input + delta;
-#else
-            const D xn = *input;
-#endif
-            D yn = (OCCUPANCY >> 0 & 1) * b0 * xn + s1n1;
-            s1n1 = (OCCUPANCY >> 1 & 1) * b1 * xn + (OCCUPANCY >> 3 & 1) * negativeA1 * yn + s2n1;
-            s2n1 = (OCCUPANCY >> 2 & 1) * b2 * xn + (OCCUPANCY >> 4 & 1) * negativeA2 * yn;
-
-            input += stride;
-
-            *output = yn;
-            output += stride;
-
-#if defined(__i386__) || defined(__x86_x64__)
-            delta = -delta;
-#endif
-        }
-        delays[0] = s1n1;
-        delays[localStride] = s2n1;
-        ++delays;
-    }
-}
-
 // Helper function to zero channels in the input buffer.
 // This is used for the degenerate coefficient case which results in all zeroes.
 template <typename D>
@@ -180,90 +252,69 @@
     }
 }
 
-template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
-void biquad_filter_fast(D *out, const D *in, size_t frames, size_t stride,
-        size_t channelCount, D *delays, const D *coefs, size_t localStride) {
-    if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
-        zeroChannels(out, frames, stride, channelCount);
-        return;
-    }
-    biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>(
-            out, in, frames, stride, channelCount, delays, coefs, localStride);
-}
-
-#ifdef USE_NEON
-
-template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
-void biquad_filter_neon_impl(F *out, const F *in, size_t frames, size_t stride,
+template <template <typename, typename> typename FilterType,
+        size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
+void biquad_filter_func_impl(F *out, const F *in, size_t frames, size_t stride,
         size_t channelCount, F *delays, const F *coefs, size_t localStride) {
     using namespace android::audio_utils::intrinsics;
 
     constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T.
-    T b0, b1, b2, negativeA1, negativeA2;
-    if constexpr (SAME_COEF_PER_CHANNEL) {
-        b0 = vdupn<T>(coefs[0]);
-        b1 = vdupn<T>(coefs[1]);
-        b2 = vdupn<T>(coefs[2]);
-        negativeA1 = vneg(vdupn<T>(coefs[3]));
-        negativeA2 = vneg(vdupn<T>(coefs[4]));
-    }
+    const size_t coefStride = SAME_COEF_PER_CHANNEL ? 1 : localStride;
+    using CoefType = std::conditional_t<SAME_COEF_PER_CHANNEL, F, T>;
+
     for (size_t i = 0; i < channelCount; i += elements) {
-        if constexpr (!SAME_COEF_PER_CHANNEL) {
-            b0 = vld1<T>(coefs);
-            b1 = vld1<T>(coefs + localStride);
-            b2 = vld1<T>(coefs + localStride * 2);
-            negativeA1 = vneg(vld1<T>(coefs + localStride * 3));
-            negativeA2 = vneg(vld1<T>(coefs + localStride * 4));
-            coefs += elements;
-        }
         T s1 = vld1<T>(&delays[0]);
         T s2 = vld1<T>(&delays[localStride]);
-        const F *input = &in[i];
-        F *output = &out[i];
-        for (size_t j = frames; j > 0; --j) {
-            T xn = vld1<T>(input);
-            T yn = s1;
 
-            if constexpr (OCCUPANCY >> 0 & 1) {
-                yn = vmla(yn, b0, xn);
-            }
-            s1 = s2;
-            if constexpr (OCCUPANCY >> 3 & 1) {
-                s1 = vmla(s1, negativeA1, yn);
-            }
-            if constexpr (OCCUPANCY >> 1 & 1) {
-                s1 = vmla(s1, b1, xn);
-            }
-            if constexpr (OCCUPANCY >> 2 & 1) {
-                s2 = vmul(b2, xn);
-            } else {
-                s2 = vdupn<T>(0.f);
-            }
-            if constexpr (OCCUPANCY >> 4 & 1) {
-                s2 = vmla(s2, negativeA2, yn);
-            }
-
-            input += stride;
-            vst1(output, yn);
-            output += stride;
-        }
-        vst1(&delays[0], s1);
-        vst1(&delays[localStride], s2);
+        FilterType<T, CoefType> kernel(
+                vld1<CoefType>(coefs), vld1<CoefType>(coefs + coefStride),
+                vld1<CoefType>(coefs + coefStride * 2), vld1<CoefType>(coefs + coefStride * 3),
+                vld1<CoefType>(coefs + coefStride * 4),
+                s1, s2);
+        if constexpr (!SAME_COEF_PER_CHANNEL) coefs += elements;
+        kernel.template process<F, OCCUPANCY>(&out[i], &in[i], frames, stride);
+        vst1(&delays[0], kernel.s1_);
+        vst1(&delays[localStride], kernel.s2_);
         delays += elements;
     }
 }
 
-#define BIQUAD_FILTER_CASE(N, ... /* type */) \
+// Find the nearest occupancy mask that includes all the desired bits.
+template <typename T, size_t N>
+static constexpr size_t nearestOccupancy(T occupancy, const T (&occupancies)[N]) {
+    if (occupancy < 32) {
+        for (auto test : occupancies) {
+            if ((occupancy & test) == occupancy) return test;
+        }
+    }
+    return 31;
+}
+
+enum FILTER_OPTION {
+    FILTER_OPTION_SCALAR_ONLY = (1 << 0),
+};
+
+// Default biquad type.
+template <typename T, typename F>
+using BiquadFilterType = BiquadDirect2Transpose<T, F>;
+
+#define BIQUAD_FILTER_CASE(N, FilterType, ... /* type */) \
             case N: { \
-                biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, __VA_ARGS__>( \
+                using VectorType = __VA_ARGS__; \
+                biquad_filter_func_impl< \
+                        FilterType, \
+                        nearestOccupancy(OCCUPANCY, \
+                                FilterType<VectorType, D>::required_occupancies_), \
+                        SAME_COEF_PER_CHANNEL, VectorType>( \
                         out + offset, in + offset, frames, stride, remaining, \
                         delays + offset, c, localStride); \
                 goto exit; \
             }
 
 template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
-void biquad_filter_neon(D *out, const D *in, size_t frames, size_t stride,
-        size_t channelCount, D *delays, const D *coefs, size_t localStride) {
+void biquad_filter_func(D *out, const D *in, size_t frames, size_t stride,
+        size_t channelCount, D *delays, const D *coefs, size_t localStride,
+        FILTER_OPTION filterOptions) {
     if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
         zeroChannels(out, frames, stride, channelCount);
         return;
@@ -274,41 +325,53 @@
     // using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; };
     // using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
 
+#ifdef USE_NEON
+    // use NEON types to ensure we have the proper intrinsic acceleration.
+    using alt_16_t = float32x4x4_t;
+    using alt_8_t = float32x4x2_t;
+    using alt_4_t = float32x4_t;
+#else
+    // Use C++ types, no NEON needed.
+    using alt_16_t = intrinsics::internal_array_t<float, 16>;
+    using alt_8_t = intrinsics::internal_array_t<float, 8>;
+    using alt_4_t = intrinsics::internal_array_t<float, 4>;
+#endif
+
     for (size_t offset = 0; offset < channelCount; ) {
         size_t remaining = channelCount - offset;
         auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset;
+        if (filterOptions & FILTER_OPTION_SCALAR_ONLY) goto scalar;
         if constexpr (std::is_same_v<D, float>) {
             switch (remaining) {
             default:
                 if (remaining >= 16) {
                     remaining &= ~15;
-                    biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, float32x4x4_t>(
+                    biquad_filter_func_impl<
+                            BiquadFilterType,
+                            nearestOccupancy(OCCUPANCY,
+                                    BiquadFilterType<D, D>::required_occupancies_),
+                            SAME_COEF_PER_CHANNEL, alt_16_t>(
                             out + offset, in + offset, frames, stride, remaining,
                             delays + offset, c, localStride);
                     offset += remaining;
                     continue;
                 }
                 break;  // case 1 handled at bottom.
-            BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>)
-            BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>)
-            BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>)
-            BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>)
-            BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>)
-            BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>)
-            BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>)
-            // We choose the NEON intrinsic type over internal_array for 8 to
-            // check if there is any performance difference in benchmark (should be similar).
-            // BIQUAD_FILTER_CASE(8, intrinsics::internal_array_t<float, 8>)
-            BIQUAD_FILTER_CASE(8, float32x4x2_t)
-            BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>)
-            BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>)
-            BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>)
-            BIQUAD_FILTER_CASE(4, float32x4_t)
-            // We choose the NEON intrinsic type over internal_array for 4 to
-            // check if there is any performance difference in benchmark (should be similar).
-            // BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<float, 4>)
-            BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>)
-            BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>)
+            BIQUAD_FILTER_CASE(15, BiquadFilterType, intrinsics::internal_array_t<float, 15>)
+            BIQUAD_FILTER_CASE(14, BiquadFilterType, intrinsics::internal_array_t<float, 14>)
+            BIQUAD_FILTER_CASE(13, BiquadFilterType, intrinsics::internal_array_t<float, 13>)
+            BIQUAD_FILTER_CASE(12, BiquadFilterType, intrinsics::internal_array_t<float, 12>)
+            BIQUAD_FILTER_CASE(11, BiquadFilterType, intrinsics::internal_array_t<float, 11>)
+            BIQUAD_FILTER_CASE(10, BiquadFilterType, intrinsics::internal_array_t<float, 10>)
+            BIQUAD_FILTER_CASE(9, BiquadFilterType, intrinsics::internal_array_t<float, 9>)
+            BIQUAD_FILTER_CASE(8, BiquadFilterType, alt_8_t)
+            BIQUAD_FILTER_CASE(7, BiquadFilterType, intrinsics::internal_array_t<float, 7>)
+            BIQUAD_FILTER_CASE(6, BiquadFilterType, intrinsics::internal_array_t<float, 6>)
+            BIQUAD_FILTER_CASE(5, BiquadFilterType, intrinsics::internal_array_t<float, 5>)
+            BIQUAD_FILTER_CASE(4, BiquadFilterType, alt_4_t)
+            BIQUAD_FILTER_CASE(3, BiquadFilterType, intrinsics::internal_array_t<float, 3>)
+            BIQUAD_FILTER_CASE(2, BiquadFilterType, intrinsics::internal_array_t<float, 2>)
+            // BIQUAD_FILTER_CASE(1, BiquadFilterType, intrinsics::internal_array_t<float, 1>)
             }
         } else if constexpr (std::is_same_v<D, double>) {
 #if defined(__aarch64__)
@@ -316,27 +379,34 @@
             default:
                 if (remaining >= 8) {
                     remaining &= ~7;
-                    biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL,
-                              intrinsics::internal_array_t<double, 8>>(
+                    biquad_filter_func_impl<BiquadFilterType,
+                            nearestOccupancy(OCCUPANCY,
+                                    BiquadFilterType<D, D>::required_occupancies_),
+                            SAME_COEF_PER_CHANNEL,
+                            intrinsics::internal_array_t<double, 8>>(
                             out + offset, in + offset, frames, stride, remaining,
                             delays + offset, c, localStride);
                     offset += remaining;
                     continue;
                 }
                 break; // case 1 handled at bottom.
-            BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>)
-            BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>)
-            BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>)
-            BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>)
-            BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>)
-            BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>)
+            BIQUAD_FILTER_CASE(7, BiquadFilterType, intrinsics::internal_array_t<double, 7>)
+            BIQUAD_FILTER_CASE(6, BiquadFilterType, intrinsics::internal_array_t<double, 6>)
+            BIQUAD_FILTER_CASE(5, BiquadFilterType, intrinsics::internal_array_t<double, 5>)
+            BIQUAD_FILTER_CASE(4, BiquadFilterType, intrinsics::internal_array_t<double, 4>)
+            BIQUAD_FILTER_CASE(3, BiquadFilterType, intrinsics::internal_array_t<double, 3>)
+            BIQUAD_FILTER_CASE(2, BiquadFilterType, intrinsics::internal_array_t<double, 2>)
             };
 #endif
         }
+        scalar:
         // Essentially the code below is scalar, the same as
         // biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>,
         // but formulated with NEON intrinsic-like call pattern.
-        biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, D>(
+        biquad_filter_func_impl<BiquadFilterType,
+                nearestOccupancy(OCCUPANCY,
+                        BiquadFilterType<D, D>::required_occupancies_),
+                 SAME_COEF_PER_CHANNEL, D>(
                 out + offset, in + offset, frames, stride, remaining,
                 delays + offset, c, localStride);
         offset += remaining;
@@ -344,8 +414,6 @@
     exit:;
 }
 
-#endif // USE_NEON
-
 } // namespace details
 
 /**
@@ -584,16 +652,14 @@
         }
 
         // Select the proper filtering function from our array.
-        (void)optimized;                // avoid unused variable warning.
-        mFunc = mFilterFast[category];  // default if we don't have processor optimization.
-
-#ifdef USE_NEON
-        /* if constexpr (std::is_same_v<D, float>) */ {
-            if (optimized) {
-                mFunc = mFilterNeon[category];
-            }
+        if (optimized) {
+            mFilterOptions = (details::FILTER_OPTION)
+                    (mFilterOptions & ~details::FILTER_OPTION_SCALAR_ONLY);
+        } else {
+             mFilterOptions = (details::FILTER_OPTION)
+                     (mFilterOptions | details::FILTER_OPTION_SCALAR_ONLY);
         }
-#endif
+        mFunc = mFilterFuncs[category];
     }
 
     /**
@@ -603,7 +669,7 @@
      * \param in      pointer to the input data
      * \param frames  number of audio frames to be processed
      */
-    void process(D* out, const D *in, size_t frames) {
+    void process(D* out, const D* in, size_t frames) {
         process(out, in, frames, mChannelCount);
     }
 
@@ -615,10 +681,10 @@
      * \param frames  number of audio frames to be processed
      * \param stride  the total number of samples associated with a frame, if not channelCount.
      */
-    void process(D* out, const D *in, size_t frames, size_t stride) {
+    void process(D* out, const D* in, size_t frames, size_t stride) {
         assert(stride >= mChannelCount);
         mFunc(out, in, frames, stride, mChannelCount, mDelays.data(),
-                mCoefs.data(), mChannelCount);
+                mCoefs.data(), mChannelCount, mFilterOptions);
     }
 
     /**
@@ -655,7 +721,7 @@
                     auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
                     auto delays = mDelays.data() + fromEnd;
                     mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1,
-                            delays, coefs, mChannelCount);
+                            delays, coefs, mChannelCount, mFilterOptions);
                 }
 
                 auto delays = mDelays.data() + baseIdx;
@@ -664,13 +730,13 @@
                 // sliding one audio sample at a time.
                 mFunc(inout, inout,
                         frames - channelBlock + 1, 1 /* stride */, channelBlock,
-                        delays, coefs, mChannelCount);
+                        delays, coefs, mChannelCount, mFilterOptions);
 
                 // drain data pipe.
                 for (size_t i = 1; i < channelBlock; ++i) {
                     mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i,
                             1 /* frames */, 1 /* stride */, channelBlock - i,
-                            delays, coefs, mChannelCount);
+                            delays, coefs, mChannelCount, mFilterOptions);
                 }
             }
         }
@@ -681,7 +747,7 @@
             auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
             mFunc(inout, inout,
                     frames, 1 /* stride */, 1 /* channelCount */,
-                    mDelays.data() + fromEnd, coefs, mChannelCount);
+                    mDelays.data() + fromEnd, coefs, mChannelCount, mFilterOptions);
         }
     }
 
@@ -746,121 +812,57 @@
      */
     std::vector<D> mDelays;
 
-    using filter_func = decltype(details::biquad_filter_fast<0, true, D>);
+    details::FILTER_OPTION mFilterOptions{};
 
-    /**
-     * \var filter_func* mFunc
+    // Consider making a separate delegation class.
+    /*
+     * We store an array of functions based on the occupancy.
      *
-     * The current filter function selected for the channel occupancy of the Biquad.
-     */
-    filter_func *mFunc;
-
-    // Create a functional wrapper to feed "biquad_filter_fast" to
-    // make_functional_array() to populate the array.
-    //
-    // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
-    // b0 b1 b2 a1 a2  (from lsb to msb)
-    template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
-    struct FuncWrap {
-        template<typename T>
-        static constexpr size_t nearest() {
-            // Combine cases to both improve expected performance and reduce code space.
-            // Some occupancy masks provide worse performance than more occupied masks.
-            constexpr size_t required_occupancies[] = {
-                1,  // constant scale
-                3,  // single zero
-                7,  // double zero
-                9,  // single pole
-                // 11, // first order IIR (unnecessary optimization, close enough to 31).
-                27, // double pole + single zero
-                31, // second order IIR (full Biquad)
-            };
-            if constexpr (OCCUPANCY < 32) {
-                for (auto test : required_occupancies) {
-                    if ((OCCUPANCY & test) == OCCUPANCY) return test;
-                }
-            } else {
-                static_assert(intrinsics::dependent_false_v<T>);
-            }
-            return 0; // never gets here.
-        }
-
-        static void func(D* out, const D *in, size_t frames, size_t stride,
-                size_t channelCount, D *delays, const D *coef, size_t localStride) {
-            constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
-            details::biquad_filter_fast<NEAREST_OCCUPANCY, SC>(
-                    out, in, frames, stride, channelCount, delays, coef, localStride);
-        }
-    };
-
-    /**
-     * \var mFilterFast
-     *
-     * std::array of functions based on coefficient occupancy.
+     * OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
+     * b0 b1 b2 a1 a2  (from lsb to msb)
      *
      *  static inline constexpr std::array<filter_func*, M> mArray = {
-     *     biquad_filter_fast<0>,
-     *     biquad_filter_fast<1>,
-     *     biquad_filter_fast<2>,
+     *     biquad_filter_func<0>,
+     *     biquad_filter_func<1>,
+     *     biquad_filter_func<2>,
      *      ...
-     *     biquad_filter_fast<(1 << kBiquadNumCoefs) - 1>,
+     *     biquad_filter_func<(1 << kBiquadNumCoefs) - 1>,
      *  };
      *
      * Every time the coefficients are changed, we select the processing function from
      * this table.
      */
-    static inline constexpr auto mFilterFast =
-            details::make_functional_array<
-                    FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
 
-#ifdef USE_NEON
-    // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
-    // b0 b1 b2 a1 a2  (from lsb to msb)
-
+    // Used to build the functional array.
     template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
-    struct FuncWrapNeon {
-        template<typename T>
-        static constexpr size_t nearest() {
-            // combine cases to both improve expected performance and reduce code space.
-            //
-            // This lists the occupancies we will specialize functions for.
-            constexpr size_t required_occupancies[] = {
-                1,  // constant scale
-                3,  // single zero
-                7,  // double zero
-                9,  // single pole
-                11, // first order IIR
-                27, // double pole + single zero
-                31, // second order IIR (full Biquad)
-            };
-            if constexpr (OCCUPANCY < 32) {
-                for (auto test : required_occupancies) {
-                    if ((OCCUPANCY & test) == OCCUPANCY) return test;
-                }
-            } else {
-                static_assert(intrinsics::dependent_false_v<T>);
-            }
-            return 0; // never gets here.
-        }
-
+    struct FuncWrap {
         static void func(D* out, const D *in, size_t frames, size_t stride,
-                size_t channelCount, D *delays, const D *coef, size_t localStride) {
-            constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
-            details::biquad_filter_neon<NEAREST_OCCUPANCY, SC>(
-                    out, in, frames, stride, channelCount, delays, coef, localStride);
+                size_t channelCount, D *delays, const D *coef, size_t localStride,
+                details::FILTER_OPTION filterOptions) {
+            constexpr size_t NEAREST_OCCUPANCY =
+                details::nearestOccupancy(
+                        OCCUPANCY, details::BiquadFilterType<D, D>::required_occupancies_);
+            details::biquad_filter_func<NEAREST_OCCUPANCY, SC>(
+                    out, in, frames, stride, channelCount, delays, coef, localStride,
+                    filterOptions);
         }
     };
 
-    // Neon optimized array of functions.
-    static inline constexpr auto mFilterNeon =
+    // Vector optimized array of functions.
+    static inline constexpr auto mFilterFuncs =
             details::make_functional_array<
-                    FuncWrapNeon, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
-#endif // USE_NEON
+                    FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
 
+    /**
+     * \var mFunc
+     *
+     * The current filter function selected for the channel occupancy of the Biquad.
+     * It will be one of mFilterFuncs.
+     */
+    std::decay_t<decltype(mFilterFuncs[0])> mFunc;
 };
 
 } // namespace android::audio_utils
 
+#pragma pop_macro("USE_DITHER")
 #pragma pop_macro("USE_NEON")
-
-#endif  // !ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
diff --git a/audio_utils/include/audio_utils/intrinsic_utils.h b/audio_utils/include/audio_utils/intrinsic_utils.h
index ed2b2bb..0c333e0 100644
--- a/audio_utils/include/audio_utils/intrinsic_utils.h
+++ b/audio_utils/include/audio_utils/intrinsic_utils.h
@@ -78,6 +78,45 @@
   using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
 */
 
+// add a + b
+template<typename T>
+static inline T vadd(T a, T b) {
+    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+        return a + b;
+
+#ifdef USE_NEON
+    } else if constexpr (std::is_same_v<T, float32x2_t>) {
+        return vadd_f32(a, b);
+    } else if constexpr (std::is_same_v<T, float32x4_t>) {
+        return vaddq_f32(a, b);
+#if defined(__aarch64__)
+    } else if constexpr (std::is_same_v<T, float64x2_t>) {
+        return vaddq_f64(a, b);
+#endif
+#endif // USE_NEON
+
+    } else /* constexpr */ {
+        T ret;
+        auto &[retval] = ret;  // single-member struct
+        const auto &[aval] = a;
+        const auto &[bval] = b;
+        if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+            for (size_t i = 0; i < std::size(aval); ++i) {
+                retval[i] = vadd(aval[i], bval[i]);
+            }
+            return ret;
+        } else /* constexpr */ {
+             auto &[r1, r2] = retval;
+             const auto &[a1, a2] = aval;
+             const auto &[b1, b2] = bval;
+             r1 = vadd(a1, b1);
+             r2 = vadd(a2, b2);
+             return ret;
+        }
+    }
+}
+
 // duplicate float into all elements.
 template<typename T, typename F>
 static inline T vdupn(F f) {
@@ -156,6 +195,73 @@
     }
 }
 
+/**
+ * Returns the multiply-accumulate r = a + b * c as follows:
+ * r_i = a_i + b_i * c_i if a, b and c are the same vector type or
+ * r_i = a_i + b_i * c if a and b are vectors and c is scalar or
+ * r_i = a_i + b * c_i if a and c are vectors and b is scalar.
+ */
+template<typename T, typename S, typename F>
+static inline T vmla(T a, S b, F c) {
+    // Both types T and S are non-primitive and they are not equal.  T == S handled below.
+    (void) a;
+    (void) b;
+    (void) c;
+    static_assert(dependent_false_v<T>);
+}
+
+template<typename T, typename F>
+static inline T vmla(T a, T b, F c) {
+    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+        if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+            return a + b * c;
+        } else {
+            static_assert(dependent_false_v<T>);
+        }
+    } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+        // handle the lane variant: a and b are vectors, c is a scalar multiplier.
+#ifdef USE_NEON
+        if constexpr (std::is_same_v<T, float32x2_t>) {
+            return vmla_n_f32(a, b, c);
+        } else if constexpr (std::is_same_v<T, float32x4_t>) {
+            return vmlaq_n_f32(a, b, c);
+#if defined(__aarch64__)
+        } else if constexpr (std::is_same_v<T, float64x2_t>) {
+            return vmlaq_f64(a, b, vdupq_n_f64(c));  // broadcast scalar c to both lanes.
+#endif
+        } else
+#endif // USE_NEON
+        {
+        T ret;
+        auto &[retval] = ret;  // single-member struct
+        const auto &[aval] = a;
+        const auto &[bval] = b;
+        if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+            for (size_t i = 0; i < std::size(aval); ++i) {
+                retval[i] = vmla(aval[i], bval[i], c);
+            }
+            return ret;
+        } else /* constexpr */ {
+             auto &[r1, r2] = retval;
+             const auto &[a1, a2] = aval;
+             const auto &[b1, b2] = bval;
+             r1 = vmla(a1, b1, c);
+             r2 = vmla(a2, b2, c);
+             return ret;
+        }
+        }
+    } else {
+        // Both types T and F are non-primitive and they are not equal.
+        static_assert(dependent_false_v<T>);
+    }
+}
+
+template<typename T, typename F>
+static inline T vmla(T a, F b, T c) {
+    return vmla(a, c, b);
+}
+
 // fused multiply-add a + b * c
 template<typename T>
 static inline T vmla(T a, T b, T c) {
@@ -197,7 +303,57 @@
     }
 }
 
-// multiply a * b
+/**
+ * Returns c as follows:
+ * c_i = a_i * b_i if a and b are the same vector type or
+ * c_i = a_i * b if a is a vector and b is scalar or
+ * c_i = a * b_i if a is scalar and b is a vector.
+ */
+template<typename T, typename F>
+static inline auto vmul(T a, F b) {
+    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+        if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+            return a * b;
+        } else /* constexpr */ {
+            return vmul(b, a); // we prefer T to be the vector/struct form.
+        }
+    } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+        // handle the lane variant
+#ifdef USE_NEON
+        if constexpr (std::is_same_v<T, float32x2_t>) {
+            return vmul_n_f32(a, b);
+        } else if constexpr (std::is_same_v<T, float32x4_t>) {
+            return vmulq_n_f32(a, b);
+#if defined(__aarch64__)
+        } else if constexpr (std::is_same_v<T, float64x2_t>) {
+            return vmulq_n_f64(a, b);
+#endif
+        } else
+#endif // USE_NEON
+        {
+        T ret;
+        auto &[retval] = ret;  // single-member struct
+        const auto &[aval] = a;
+        if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+            for (size_t i = 0; i < std::size(aval); ++i) {
+                retval[i] = vmul(aval[i], b);
+            }
+            return ret;
+        } else /* constexpr */ {
+             auto &[r1, r2] = retval;
+             const auto &[a1, a2] = aval;
+             r1 = vmul(a1, b);
+             r2 = vmul(a2, b);
+             return ret;
+        }
+        }
+    } else {
+        // Both types T and F are non-primitive and they are not equal.
+        static_assert(dependent_false_v<T>);
+    }
+}
+
 template<typename T>
 static inline T vmul(T a, T b) {
     if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
@@ -308,6 +464,45 @@
     }
 }
 
+// subtract a - b
+template<typename T>
+static inline T vsub(T a, T b) {
+    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+        return a - b;
+
+#ifdef USE_NEON
+    } else if constexpr (std::is_same_v<T, float32x2_t>) {
+        return vsub_f32(a, b);
+    } else if constexpr (std::is_same_v<T, float32x4_t>) {
+        return vsubq_f32(a, b);
+#if defined(__aarch64__)
+    } else if constexpr (std::is_same_v<T, float64x2_t>) {
+        return vsubq_f64(a, b);
+#endif
+#endif // USE_NEON
+
+    } else /* constexpr */ {
+        T ret;
+        auto &[retval] = ret;  // single-member struct
+        const auto &[aval] = a;
+        const auto &[bval] = b;
+        if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+            for (size_t i = 0; i < std::size(aval); ++i) {
+                retval[i] = vsub(aval[i], bval[i]);
+            }
+            return ret;
+        } else /* constexpr */ {
+             auto &[r1, r2] = retval;
+             const auto &[a1, a2] = aval;
+             const auto &[b1, b2] = bval;
+             r1 = vsub(a1, b1);
+             r2 = vsub(a2, b2);
+             return ret;
+        }
+    }
+}
+
 } // namespace android::audio_utils::intrinsics
 
 #pragma pop_macro("USE_NEON")
diff --git a/audio_utils/tests/intrinsic_tests.cpp b/audio_utils/tests/intrinsic_tests.cpp
index 6a16747..d9686ef 100644
--- a/audio_utils/tests/intrinsic_tests.cpp
+++ b/audio_utils/tests/intrinsic_tests.cpp
@@ -25,6 +25,13 @@
 using FloatTypes = ::testing::Types<float, double>;
 TYPED_TEST_CASE(IntrisicUtilsTest, FloatTypes);
 
+TYPED_TEST(IntrisicUtilsTest, vadd) {
+    constexpr TypeParam a = 0.25f;
+    constexpr TypeParam b = 0.5f;
+    constexpr TypeParam result = a + b;
+    ASSERT_EQ(result, android::audio_utils::intrinsics::vadd(a, b));
+}
+
 TYPED_TEST(IntrisicUtilsTest, vdupn) {
     constexpr TypeParam value = 1.f;
     ASSERT_EQ(value, android::audio_utils::intrinsics::vdupn<TypeParam>(value));
@@ -62,3 +69,10 @@
             &destination, android::audio_utils::intrinsics::vdupn<TypeParam>(value));
     ASSERT_EQ(value, destination);
 }
+
+TYPED_TEST(IntrisicUtilsTest, vsub) {
+    constexpr TypeParam a = 1.25f;
+    constexpr TypeParam b = 1.5f;
+    constexpr TypeParam result = a - b;
+    ASSERT_EQ(result, android::audio_utils::intrinsics::vsub(a, b));
+}