MIPS optimizations for AEC audio processing module

Added new optimizations for MIPS that were removed in r6797.
For more information, see https://code.google.com/p/webrtc/source/detail?r=6797

R=andrew@webrtc.org, djordje.pesut@imgtec.com

Review URL: https://webrtc-codereview.appspot.com/15259004

Patch from Ljubomir Papuga <ljubomir.papuga@mips.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7010 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft_mips.c b/webrtc/modules/audio_processing/aec/aec_rdft_mips.c
index b19d6f2..6e51d3a 100644
--- a/webrtc/modules/audio_processing/aec/aec_rdft_mips.c
+++ b/webrtc/modules/audio_processing/aec/aec_rdft_mips.c
@@ -11,7 +11,7 @@
 #include "webrtc/modules/audio_processing/aec/aec_rdft.h"
 #include "webrtc/typedefs.h"
 
-static void bitrv2_128_mips(float *a) {
+static void bitrv2_128_mips(float* a) {
   // n is 128
   float xr, xi, yr, yi;
 
@@ -268,9 +268,543 @@
   a[119] = xi;
 }
 
-static void cftfsub_128_mips(float *a) {
-  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
-  float f0, f1, f2, f3, f4, f5, f6, f7;
+static void cft1st_128_mips(float* a) {  // In-place pass over a[0..127].
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14;
+  int a_ptr, p1_rdft, p2_rdft, count;  // NOTE(review): addresses kept in int.
+  float* first = rdft_wk3ri_first;  // Advanced by 8 bytes per loop pass.
+  float* second = rdft_wk3ri_second;  // Advanced by 8 bytes per loop pass.
+  // a[0..7] and a[8..15] are unrolled; a[16..127] runs as 7 loop passes.
+  __asm __volatile (
+    ".set       push                                                    \n\t"
+    ".set       noreorder                                               \n\t"
+    // first 8: a[0..7], add/sub only (no multiplies)
+    "lwc1       %[f0],        0(%[a])                                   \n\t"
+    "lwc1       %[f1],        4(%[a])                                   \n\t"
+    "lwc1       %[f2],        8(%[a])                                   \n\t"
+    "lwc1       %[f3],        12(%[a])                                  \n\t"
+    "lwc1       %[f4],        16(%[a])                                  \n\t"
+    "lwc1       %[f5],        20(%[a])                                  \n\t"
+    "lwc1       %[f6],        24(%[a])                                  \n\t"
+    "lwc1       %[f7],        28(%[a])                                  \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "add.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f8],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f2],        %[f1],        %[f4]                       \n\t"
+    "add.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "add.s      %[f4],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f6],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "swc1       %[f7],        0(%[a])                                   \n\t"
+    "swc1       %[f8],        16(%[a])                                  \n\t"
+    "swc1       %[f2],        28(%[a])                                  \n\t"
+    "swc1       %[f1],        12(%[a])                                  \n\t"
+    "swc1       %[f4],        4(%[a])                                   \n\t"
+    "swc1       %[f6],        20(%[a])                                  \n\t"
+    "swc1       %[f3],        8(%[a])                                   \n\t"
+    "swc1       %[f0],        24(%[a])                                  \n\t"
+    // second 8: a[8..15]; four results are scaled by f9 (rdft_w + 8 bytes)
+    "lwc1       %[f0],        32(%[a])                                  \n\t"
+    "lwc1       %[f1],        36(%[a])                                  \n\t"
+    "lwc1       %[f2],        40(%[a])                                  \n\t"
+    "lwc1       %[f3],        44(%[a])                                  \n\t"
+    "lwc1       %[f4],        48(%[a])                                  \n\t"
+    "lwc1       %[f5],        52(%[a])                                  \n\t"
+    "lwc1       %[f6],        56(%[a])                                  \n\t"
+    "lwc1       %[f7],        60(%[a])                                  \n\t"
+    "add.s      %[f8],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "add.s      %[f7],        %[f4],        %[f1]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f1]                       \n\t"
+    "add.s      %[f1],        %[f3],        %[f8]                       \n\t"
+    "sub.s      %[f3],        %[f3],        %[f8]                       \n\t"
+    "sub.s      %[f8],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f5],        %[f6],        %[f2]                       \n\t"
+    "sub.s      %[f6],        %[f2],        %[f6]                       \n\t"
+    "lwc1       %[f9],        8(%[rdft_w])                              \n\t"
+    "sub.s      %[f2],        %[f8],        %[f7]                       \n\t"
+    "add.s      %[f8],        %[f8],        %[f7]                       \n\t"
+    "sub.s      %[f7],        %[f4],        %[f0]                       \n\t"
+    "add.s      %[f4],        %[f4],        %[f0]                       \n\t"
+    // prepare for loop: a_ptr = a + 64 bytes, p1/p2 = rdft_w + 8/16, count = 7
+    "addiu      %[a_ptr],     %[a],         64                          \n\t"
+    "addiu      %[p1_rdft],   %[rdft_w],    8                           \n\t"
+    "addiu      %[p2_rdft],   %[rdft_w],    16                          \n\t"
+    "addiu      %[count],     $zero,        7                           \n\t"
+    // finish second 8 (scale by f9, then store a[8..15])
+    "mul.s      %[f2],        %[f9],        %[f2]                       \n\t"
+    "mul.s      %[f8],        %[f9],        %[f8]                       \n\t"
+    "mul.s      %[f7],        %[f9],        %[f7]                       \n\t"
+    "mul.s      %[f4],        %[f9],        %[f4]                       \n\t"
+    "swc1       %[f1],        32(%[a])                                  \n\t"
+    "swc1       %[f3],        52(%[a])                                  \n\t"
+    "swc1       %[f5],        36(%[a])                                  \n\t"
+    "swc1       %[f6],        48(%[a])                                  \n\t"
+    "swc1       %[f2],        40(%[a])                                  \n\t"
+    "swc1       %[f8],        44(%[a])                                  \n\t"
+    "swc1       %[f7],        56(%[a])                                  \n\t"
+    "swc1       %[f4],        60(%[a])                                  \n\t"
+    // loop: each pass covers 64 bytes at a_ptr as two 8-float halves
+   "1:                                                                  \n\t"
+    "lwc1       %[f0],        0(%[a_ptr])                               \n\t"
+    "lwc1       %[f1],        4(%[a_ptr])                               \n\t"
+    "lwc1       %[f2],        8(%[a_ptr])                               \n\t"
+    "lwc1       %[f3],        12(%[a_ptr])                              \n\t"
+    "lwc1       %[f4],        16(%[a_ptr])                              \n\t"
+    "lwc1       %[f5],        20(%[a_ptr])                              \n\t"
+    "lwc1       %[f6],        24(%[a_ptr])                              \n\t"
+    "lwc1       %[f7],        28(%[a_ptr])                              \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "lwc1       %[f10],       4(%[p1_rdft])                             \n\t"
+    "lwc1       %[f11],       0(%[p2_rdft])                             \n\t"
+    "lwc1       %[f12],       4(%[p2_rdft])                             \n\t"
+    "lwc1       %[f13],       8(%[first])                               \n\t"
+    "lwc1       %[f14],       12(%[first])                              \n\t"
+    "add.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f8],        %[f8],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f6],        %[f6],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f5],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "swc1       %[f7],        0(%[a_ptr])                               \n\t"
+    "swc1       %[f2],        4(%[a_ptr])                               \n\t"
+    "mul.s      %[f4],        %[f9],        %[f8]                       \n\t"
+#if defined(MIPS32_R2_LE)  // Variant using madd.s/nmsub.s.
+    "mul.s      %[f8],        %[f10],       %[f8]                       \n\t"
+    "mul.s      %[f7],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f0],        %[f12],       %[f0]                       \n\t"
+    "mul.s      %[f2],        %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f3],        %[f14],       %[f3]                       \n\t"
+    "nmsub.s    %[f4],        %[f4],        %[f10],       %[f6]         \n\t"
+    "madd.s     %[f8],        %[f8],        %[f9],        %[f6]         \n\t"
+    "nmsub.s    %[f7],        %[f7],        %[f12],       %[f5]         \n\t"
+    "madd.s     %[f0],        %[f0],        %[f11],       %[f5]         \n\t"
+    "nmsub.s    %[f2],        %[f2],        %[f14],       %[f1]         \n\t"
+    "madd.s     %[f3],        %[f3],        %[f13],       %[f1]         \n\t"
+#else  // Variant using separate mul.s and add.s/sub.s.
+    "mul.s      %[f7],        %[f10],       %[f6]                       \n\t"
+    "mul.s      %[f6],        %[f9],        %[f6]                       \n\t"
+    "mul.s      %[f8],        %[f10],       %[f8]                       \n\t"
+    "mul.s      %[f2],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f11],       %[f11],       %[f5]                       \n\t"
+    "mul.s      %[f5],        %[f12],       %[f5]                       \n\t"
+    "mul.s      %[f0],        %[f12],       %[f0]                       \n\t"
+    "mul.s      %[f12],       %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f13],       %[f13],       %[f1]                       \n\t"
+    "mul.s      %[f1],        %[f14],       %[f1]                       \n\t"
+    "mul.s      %[f3],        %[f14],       %[f3]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f7]                       \n\t"
+    "add.s      %[f8],        %[f6],        %[f8]                       \n\t"
+    "sub.s      %[f7],        %[f2],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f11],       %[f0]                       \n\t"
+    "sub.s      %[f2],        %[f12],       %[f1]                       \n\t"
+    "add.s      %[f3],        %[f13],       %[f3]                       \n\t"
+#endif  // MIPS32_R2_LE
+    "swc1       %[f4],        16(%[a_ptr])                              \n\t"
+    "swc1       %[f8],        20(%[a_ptr])                              \n\t"
+    "swc1       %[f7],        8(%[a_ptr])                               \n\t"
+    "swc1       %[f0],        12(%[a_ptr])                              \n\t"
+    "swc1       %[f2],        24(%[a_ptr])                              \n\t"
+    "swc1       %[f3],        28(%[a_ptr])                              \n\t"
+    "lwc1       %[f0],        32(%[a_ptr])                              \n\t"
+    "lwc1       %[f1],        36(%[a_ptr])                              \n\t"
+    "lwc1       %[f2],        40(%[a_ptr])                              \n\t"
+    "lwc1       %[f3],        44(%[a_ptr])                              \n\t"
+    "lwc1       %[f4],        48(%[a_ptr])                              \n\t"
+    "lwc1       %[f5],        52(%[a_ptr])                              \n\t"
+    "lwc1       %[f6],        56(%[a_ptr])                              \n\t"
+    "lwc1       %[f7],        60(%[a_ptr])                              \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "lwc1       %[f11],       8(%[p2_rdft])                             \n\t"
+    "lwc1       %[f12],       12(%[p2_rdft])                            \n\t"
+    "lwc1       %[f13],       8(%[second])                              \n\t"
+    "lwc1       %[f14],       12(%[second])                             \n\t"
+    "add.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f8],        %[f2],        %[f8]                       \n\t"
+    "add.s      %[f2],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f6],        %[f3],        %[f6]                       \n\t"
+    "add.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f5],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "swc1       %[f7],        32(%[a_ptr])                              \n\t"
+    "swc1       %[f2],        36(%[a_ptr])                              \n\t"
+    "mul.s      %[f4],        %[f10],       %[f8]                       \n\t"
+#if defined(MIPS32_R2_LE)  // Variant using madd.s/nmsub.s.
+    "mul.s      %[f10],       %[f10],       %[f6]                       \n\t"
+    "mul.s      %[f7],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f11],       %[f11],       %[f5]                       \n\t"
+    "mul.s      %[f2],        %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f13],       %[f13],       %[f1]                       \n\t"
+    "madd.s     %[f4],        %[f4],        %[f9],        %[f6]         \n\t"
+    "nmsub.s    %[f10],       %[f10],       %[f9],        %[f8]         \n\t"
+    "nmsub.s    %[f7],        %[f7],        %[f12],       %[f5]         \n\t"
+    "madd.s     %[f11],       %[f11],       %[f12],       %[f0]         \n\t"
+    "nmsub.s    %[f2],        %[f2],        %[f14],       %[f1]         \n\t"
+    "madd.s     %[f13],       %[f13],       %[f14],       %[f3]         \n\t"
+#else  // Variant using separate mul.s and add.s/sub.s; overwrites f9.
+    "mul.s      %[f2],        %[f9],        %[f6]                       \n\t"
+    "mul.s      %[f10],       %[f10],       %[f6]                       \n\t"
+    "mul.s      %[f9],        %[f9],        %[f8]                       \n\t"
+    "mul.s      %[f7],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f8],        %[f12],       %[f5]                       \n\t"
+    "mul.s      %[f11],       %[f11],       %[f5]                       \n\t"
+    "mul.s      %[f12],       %[f12],       %[f0]                       \n\t"
+    "mul.s      %[f5],        %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f0],        %[f14],       %[f1]                       \n\t"
+    "mul.s      %[f13],       %[f13],       %[f1]                       \n\t"
+    "mul.s      %[f14],       %[f14],       %[f3]                       \n\t"
+    "add.s      %[f4],        %[f4],        %[f2]                       \n\t"
+    "sub.s      %[f10],       %[f10],       %[f9]                       \n\t"
+    "sub.s      %[f7],        %[f7],        %[f8]                       \n\t"
+    "add.s      %[f11],       %[f11],       %[f12]                      \n\t"
+    "sub.s      %[f2],        %[f5],        %[f0]                       \n\t"
+    "add.s      %[f13],       %[f13],       %[f14]                      \n\t"
+#endif  // MIPS32_R2_LE
+    "swc1       %[f4],        48(%[a_ptr])                              \n\t"
+    "swc1       %[f10],       52(%[a_ptr])                              \n\t"
+    "swc1       %[f7],        40(%[a_ptr])                              \n\t"
+    "swc1       %[f11],       44(%[a_ptr])                              \n\t"
+    "swc1       %[f2],        56(%[a_ptr])                              \n\t"
+    "swc1       %[f13],       60(%[a_ptr])                              \n\t"
+    "addiu      %[count],     %[count],     -1                          \n\t"
+    "lwc1       %[f9],        8(%[p1_rdft])                             \n\t"  // f9 for next pass
+    "addiu      %[a_ptr],     %[a_ptr],     64                          \n\t"
+    "addiu      %[p1_rdft],   %[p1_rdft],   8                           \n\t"
+    "addiu      %[p2_rdft],   %[p2_rdft],   16                          \n\t"
+    "addiu      %[first],     %[first],     8                           \n\t"
+    "bgtz       %[count],     1b                                        \n\t"
+    " addiu     %[second],    %[second],    8                           \n\t"  // delay slot
+    ".set       pop                                                     \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
+      [f12] "=&f" (f12), [f13] "=&f" (f13), [f14] "=&f" (f14),
+      [a_ptr] "=&r" (a_ptr), [p1_rdft] "=&r" (p1_rdft), [first] "+r" (first),
+      [p2_rdft] "=&r" (p2_rdft), [count] "=&r" (count), [second] "+r" (second)
+    : [a] "r" (a), [rdft_w] "r" (rdft_w)
+    : "memory"  // The asm stores through a.
+  );
+}
+
+static void cftmdl_128_mips(float* a) {
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14;
+  int tmp_a, count;
+  __asm __volatile (
+    ".set       push                                      \n\t"
+    ".set       noreorder                                 \n\t"
+    "addiu      %[tmp_a],   %[a],         0               \n\t"
+    "addiu      %[count],   $zero,        4               \n\t"
+   "1:                                                    \n\t"
+    "addiu      %[count],   %[count],     -1              \n\t"
+    "lwc1       %[f0],      0(%[tmp_a])                   \n\t"
+    "lwc1       %[f2],      32(%[tmp_a])                  \n\t"
+    "lwc1       %[f4],      64(%[tmp_a])                  \n\t"
+    "lwc1       %[f6],      96(%[tmp_a])                  \n\t"
+    "lwc1       %[f1],      4(%[tmp_a])                   \n\t"
+    "lwc1       %[f3],      36(%[tmp_a])                  \n\t"
+    "lwc1       %[f5],      68(%[tmp_a])                  \n\t"
+    "lwc1       %[f7],      100(%[tmp_a])                 \n\t"
+    "add.s      %[f8],      %[f0],        %[f2]           \n\t"
+    "sub.s      %[f0],      %[f0],        %[f2]           \n\t"
+    "add.s      %[f2],      %[f4],        %[f6]           \n\t"
+    "sub.s      %[f4],      %[f4],        %[f6]           \n\t"
+    "add.s      %[f6],      %[f1],        %[f3]           \n\t"
+    "sub.s      %[f1],      %[f1],        %[f3]           \n\t"
+    "add.s      %[f3],      %[f5],        %[f7]           \n\t"
+    "sub.s      %[f5],      %[f5],        %[f7]           \n\t"
+    "add.s      %[f7],      %[f8],        %[f2]           \n\t"
+    "sub.s      %[f8],      %[f8],        %[f2]           \n\t"
+    "add.s      %[f2],      %[f1],        %[f4]           \n\t"
+    "sub.s      %[f1],      %[f1],        %[f4]           \n\t"
+    "add.s      %[f4],      %[f6],        %[f3]           \n\t"
+    "sub.s      %[f6],      %[f6],        %[f3]           \n\t"
+    "sub.s      %[f3],      %[f0],        %[f5]           \n\t"
+    "add.s      %[f0],      %[f0],        %[f5]           \n\t"
+    "swc1       %[f7],      0(%[tmp_a])                   \n\t"
+    "swc1       %[f8],      64(%[tmp_a])                  \n\t"
+    "swc1       %[f2],      36(%[tmp_a])                  \n\t"
+    "swc1       %[f1],      100(%[tmp_a])                 \n\t"
+    "swc1       %[f4],      4(%[tmp_a])                   \n\t"
+    "swc1       %[f6],      68(%[tmp_a])                  \n\t"
+    "swc1       %[f3],      32(%[tmp_a])                  \n\t"
+    "swc1       %[f0],      96(%[tmp_a])                  \n\t"
+    "bgtz       %[count],   1b                            \n\t"
+    " addiu     %[tmp_a],   %[tmp_a],     8               \n\t"
+    ".set       pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a)
+    : "memory"
+  );
+  f9 = rdft_w[2];
+  __asm __volatile (
+    ".set       push                                      \n\t"
+    ".set       noreorder                                 \n\t"
+    "addiu      %[tmp_a],   %[a],         128             \n\t"
+    "addiu      %[count],   $zero,        4               \n\t"
+   "1:                                                    \n\t"
+    "addiu      %[count],   %[count],     -1              \n\t"
+    "lwc1       %[f0],      0(%[tmp_a])                   \n\t"
+    "lwc1       %[f2],      32(%[tmp_a])                  \n\t"
+    "lwc1       %[f5],      68(%[tmp_a])                  \n\t"
+    "lwc1       %[f7],      100(%[tmp_a])                 \n\t"
+    "lwc1       %[f1],      4(%[tmp_a])                   \n\t"
+    "lwc1       %[f3],      36(%[tmp_a])                  \n\t"
+    "lwc1       %[f4],      64(%[tmp_a])                  \n\t"
+    "lwc1       %[f6],      96(%[tmp_a])                  \n\t"
+    "sub.s      %[f8],      %[f0],        %[f2]           \n\t"
+    "add.s      %[f0],      %[f0],        %[f2]           \n\t"
+    "sub.s      %[f2],      %[f5],        %[f7]           \n\t"
+    "add.s      %[f5],      %[f5],        %[f7]           \n\t"
+    "sub.s      %[f7],      %[f1],        %[f3]           \n\t"
+    "add.s      %[f1],      %[f1],        %[f3]           \n\t"
+    "sub.s      %[f3],      %[f4],        %[f6]           \n\t"
+    "add.s      %[f4],      %[f4],        %[f6]           \n\t"
+    "sub.s      %[f6],      %[f8],        %[f2]           \n\t"
+    "add.s      %[f8],      %[f8],        %[f2]           \n\t"
+    "add.s      %[f2],      %[f5],        %[f1]           \n\t"
+    "sub.s      %[f5],      %[f5],        %[f1]           \n\t"
+    "add.s      %[f1],      %[f3],        %[f7]           \n\t"
+    "sub.s      %[f3],      %[f3],        %[f7]           \n\t"
+    "add.s      %[f7],      %[f0],        %[f4]           \n\t"
+    "sub.s      %[f0],      %[f0],        %[f4]           \n\t"
+    "sub.s      %[f4],      %[f6],        %[f1]           \n\t"
+    "add.s      %[f6],      %[f6],        %[f1]           \n\t"
+    "sub.s      %[f1],      %[f3],        %[f8]           \n\t"
+    "add.s      %[f3],      %[f3],        %[f8]           \n\t"
+    "mul.s      %[f4],      %[f4],        %[f9]           \n\t"
+    "mul.s      %[f6],      %[f6],        %[f9]           \n\t"
+    "mul.s      %[f1],      %[f1],        %[f9]           \n\t"
+    "mul.s      %[f3],      %[f3],        %[f9]           \n\t"
+    "swc1       %[f7],      0(%[tmp_a])                   \n\t"
+    "swc1       %[f2],      4(%[tmp_a])                   \n\t"
+    "swc1       %[f5],      64(%[tmp_a])                  \n\t"
+    "swc1       %[f0],      68(%[tmp_a])                  \n\t"
+    "swc1       %[f4],      32(%[tmp_a])                  \n\t"
+    "swc1       %[f6],      36(%[tmp_a])                  \n\t"
+    "swc1       %[f1],      96(%[tmp_a])                  \n\t"
+    "swc1       %[f3],      100(%[tmp_a])                 \n\t"
+    "bgtz       %[count],   1b                            \n\t"
+    " addiu     %[tmp_a],   %[tmp_a],     8               \n\t"
+    ".set       pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a), [f9] "f" (f9)
+    : "memory"
+  );
+  f10 = rdft_w[3];
+  f11 = rdft_w[4];
+  f12 = rdft_w[5];
+  f13 = rdft_wk3ri_first[2];
+  f14 = rdft_wk3ri_first[3];
+
+  __asm __volatile (
+    ".set       push                                                    \n\t"
+    ".set       noreorder                                               \n\t"
+    "addiu      %[tmp_a],     %[a],         256                         \n\t"
+    "addiu      %[count],     $zero,        4                           \n\t"
+   "1:                                                                  \n\t"
+    "addiu      %[count],     %[count],     -1                          \n\t"
+    "lwc1       %[f0],        0(%[tmp_a])                               \n\t"
+    "lwc1       %[f2],        32(%[tmp_a])                              \n\t"
+    "lwc1       %[f4],        64(%[tmp_a])                              \n\t"
+    "lwc1       %[f6],        96(%[tmp_a])                              \n\t"
+    "lwc1       %[f1],        4(%[tmp_a])                               \n\t"
+    "lwc1       %[f3],        36(%[tmp_a])                              \n\t"
+    "lwc1       %[f5],        68(%[tmp_a])                              \n\t"
+    "lwc1       %[f7],        100(%[tmp_a])                             \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "add.s      %[f8],        %[f8],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f4],        %[f6],        %[f3]                       \n\t"
+    "add.s      %[f6],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "swc1       %[f8],        0(%[tmp_a])                               \n\t"
+    "swc1       %[f6],        4(%[tmp_a])                               \n\t"
+    "mul.s      %[f5],        %[f9],        %[f7]                       \n\t"
+#if defined(MIPS32_R2_LE)
+    "mul.s      %[f7],        %[f10],       %[f7]                       \n\t"
+    "mul.s      %[f8],        %[f11],       %[f3]                       \n\t"
+    "mul.s      %[f3],        %[f12],       %[f3]                       \n\t"
+    "mul.s      %[f6],        %[f13],       %[f0]                       \n\t"
+    "mul.s      %[f0],        %[f14],       %[f0]                       \n\t"
+    "nmsub.s    %[f5],        %[f5],        %[f10],       %[f4]         \n\t"
+    "madd.s     %[f7],        %[f7],        %[f9],        %[f4]         \n\t"
+    "nmsub.s    %[f8],        %[f8],        %[f12],       %[f2]         \n\t"
+    "madd.s     %[f3],        %[f3],        %[f11],       %[f2]         \n\t"
+    "nmsub.s    %[f6],        %[f6],        %[f14],       %[f1]         \n\t"
+    "madd.s     %[f0],        %[f0],        %[f13],       %[f1]         \n\t"
+    "swc1       %[f5],        64(%[tmp_a])                              \n\t"
+    "swc1       %[f7],        68(%[tmp_a])                              \n\t"
+#else
+    "mul.s      %[f8],        %[f10],       %[f4]                       \n\t"
+    "mul.s      %[f4],        %[f9],        %[f4]                       \n\t"
+    "mul.s      %[f7],        %[f10],       %[f7]                       \n\t"
+    "mul.s      %[f6],        %[f11],       %[f3]                       \n\t"
+    "mul.s      %[f3],        %[f12],       %[f3]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f8]                       \n\t"
+    "mul.s      %[f8],        %[f12],       %[f2]                       \n\t"
+    "mul.s      %[f2],        %[f11],       %[f2]                       \n\t"
+    "add.s      %[f7],        %[f4],        %[f7]                       \n\t"
+    "mul.s      %[f4],        %[f13],       %[f0]                       \n\t"
+    "mul.s      %[f0],        %[f14],       %[f0]                       \n\t"
+    "sub.s      %[f8],        %[f6],        %[f8]                       \n\t"
+    "mul.s      %[f6],        %[f14],       %[f1]                       \n\t"
+    "mul.s      %[f1],        %[f13],       %[f1]                       \n\t"
+    "add.s      %[f3],        %[f2],        %[f3]                       \n\t"
+    "swc1       %[f5],        64(%[tmp_a])                              \n\t"
+    "swc1       %[f7],        68(%[tmp_a])                              \n\t"
+    "sub.s      %[f6],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f0],        %[f1],        %[f0]                       \n\t"
+#endif
+    "swc1       %[f8],        32(%[tmp_a])                              \n\t"
+    "swc1       %[f3],        36(%[tmp_a])                              \n\t"
+    "swc1       %[f6],        96(%[tmp_a])                              \n\t"
+    "swc1       %[f0],        100(%[tmp_a])                             \n\t"
+    "bgtz       %[count],     1b                                        \n\t"
+    " addiu     %[tmp_a],     %[tmp_a],     8                           \n\t"
+    ".set       pop                                                     \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a),  [f9] "f" (f9), [f10] "f" (f10), [f11] "f" (f11),
+      [f12] "f" (f12), [f13] "f" (f13), [f14] "f" (f14)
+    : "memory"
+  );
+  f11 = rdft_w[6];
+  f12 = rdft_w[7];
+  f13 = rdft_wk3ri_second[2];
+  f14 = rdft_wk3ri_second[3];
+  __asm __volatile (
+    ".set       push                                                       \n\t"
+    ".set       noreorder                                                  \n\t"
+    "addiu      %[tmp_a],       %[a],           384                        \n\t"
+    "addiu      %[count],       $zero,          4                          \n\t"
+   "1:                                                                     \n\t"
+    "addiu      %[count],       %[count],       -1                         \n\t"
+    "lwc1       %[f0],          0(%[tmp_a])                                \n\t"
+    "lwc1       %[f1],          4(%[tmp_a])                                \n\t"
+    "lwc1       %[f2],          32(%[tmp_a])                               \n\t"
+    "lwc1       %[f3],          36(%[tmp_a])                               \n\t"
+    "lwc1       %[f4],          64(%[tmp_a])                               \n\t"
+    "lwc1       %[f5],          68(%[tmp_a])                               \n\t"
+    "lwc1       %[f6],          96(%[tmp_a])                               \n\t"
+    "lwc1       %[f7],          100(%[tmp_a])                              \n\t"
+    "add.s      %[f8],          %[f0],          %[f2]                      \n\t"
+    "sub.s      %[f0],          %[f0],          %[f2]                      \n\t"
+    "add.s      %[f2],          %[f4],          %[f6]                      \n\t"
+    "sub.s      %[f4],          %[f4],          %[f6]                      \n\t"
+    "add.s      %[f6],          %[f1],          %[f3]                      \n\t"
+    "sub.s      %[f1],          %[f1],          %[f3]                      \n\t"
+    "add.s      %[f3],          %[f5],          %[f7]                      \n\t"
+    "sub.s      %[f5],          %[f5],          %[f7]                      \n\t"
+    "sub.s      %[f7],          %[f2],          %[f8]                      \n\t"
+    "add.s      %[f2],          %[f2],          %[f8]                      \n\t"
+    "add.s      %[f8],          %[f1],          %[f4]                      \n\t"
+    "sub.s      %[f1],          %[f1],          %[f4]                      \n\t"
+    "sub.s      %[f4],          %[f3],          %[f6]                      \n\t"
+    "add.s      %[f3],          %[f3],          %[f6]                      \n\t"
+    "sub.s      %[f6],          %[f0],          %[f5]                      \n\t"
+    "add.s      %[f0],          %[f0],          %[f5]                      \n\t"
+    "swc1       %[f2],          0(%[tmp_a])                                \n\t"
+    "swc1       %[f3],          4(%[tmp_a])                                \n\t"
+    "mul.s      %[f5],          %[f10],         %[f7]                      \n\t"
+#if defined(MIPS32_R2_LE)
+    "mul.s      %[f7],          %[f9],          %[f7]                      \n\t"
+    "mul.s      %[f2],          %[f12],         %[f8]                      \n\t"
+    "mul.s      %[f8],          %[f11],         %[f8]                      \n\t"
+    "mul.s      %[f3],          %[f14],         %[f1]                      \n\t"
+    "mul.s      %[f1],          %[f13],         %[f1]                      \n\t"
+    "madd.s     %[f5],          %[f5],          %[f9],       %[f4]         \n\t"
+    "msub.s     %[f7],          %[f7],          %[f10],      %[f4]         \n\t"
+    "msub.s     %[f2],          %[f2],          %[f11],      %[f6]         \n\t"
+    "madd.s     %[f8],          %[f8],          %[f12],      %[f6]         \n\t"
+    "msub.s     %[f3],          %[f3],          %[f13],      %[f0]         \n\t"
+    "madd.s     %[f1],          %[f1],          %[f14],      %[f0]         \n\t"
+    "swc1       %[f5],          64(%[tmp_a])                               \n\t"
+    "swc1       %[f7],          68(%[tmp_a])                               \n\t"
+#else
+    "mul.s      %[f2],          %[f9],          %[f4]                      \n\t"
+    "mul.s      %[f4],          %[f10],         %[f4]                      \n\t"
+    "mul.s      %[f7],          %[f9],          %[f7]                      \n\t"
+    "mul.s      %[f3],          %[f11],         %[f6]                      \n\t"
+    "mul.s      %[f6],          %[f12],         %[f6]                      \n\t"
+    "add.s      %[f5],          %[f5],          %[f2]                      \n\t"
+    "sub.s      %[f7],          %[f4],          %[f7]                      \n\t"
+    "mul.s      %[f2],          %[f12],         %[f8]                      \n\t"
+    "mul.s      %[f8],          %[f11],         %[f8]                      \n\t"
+    "mul.s      %[f4],          %[f14],         %[f1]                      \n\t"
+    "mul.s      %[f1],          %[f13],         %[f1]                      \n\t"
+    "sub.s      %[f2],          %[f3],          %[f2]                      \n\t"
+    "mul.s      %[f3],          %[f13],         %[f0]                      \n\t"
+    "mul.s      %[f0],          %[f14],         %[f0]                      \n\t"
+    "add.s      %[f8],          %[f8],          %[f6]                      \n\t"
+    "swc1       %[f5],          64(%[tmp_a])                               \n\t"
+    "swc1       %[f7],          68(%[tmp_a])                               \n\t"
+    "sub.s      %[f3],          %[f3],          %[f4]                      \n\t"
+    "add.s      %[f1],          %[f1],          %[f0]                      \n\t"
+#endif
+    "swc1       %[f2],          32(%[tmp_a])                               \n\t"
+    "swc1       %[f8],          36(%[tmp_a])                               \n\t"
+    "swc1       %[f3],          96(%[tmp_a])                               \n\t"
+    "swc1       %[f1],          100(%[tmp_a])                              \n\t"
+    "bgtz       %[count],       1b                                         \n\t"
+    " addiu     %[tmp_a],       %[tmp_a],       8                          \n\t"
+    ".set       pop                                                        \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a), [f9] "f" (f9), [f10] "f" (f10), [f11] "f" (f11),
+      [f12] "f" (f12), [f13] "f" (f13), [f14] "f" (f14)
+    : "memory"
+  );
+}
+
+static void cftfsub_128_mips(float* a) {  // cft1st_128(a), then an in-place asm sum/difference loop over a.
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8;  // asm temporaries; [n] below means the float at tmp_a + n bytes.
   int tmp_a, count;
 
   cft1st_128(a);
@@ -284,52 +818,370 @@
    "1:                                                    \n\t"
     "addiu      %[count],       %[count],     -1          \n\t"
     "lwc1       %[f0],          0(%[tmp_a])               \n\t"
-    "lwc1       %[f1],          4(%[tmp_a])               \n\t"
     "lwc1       %[f2],          128(%[tmp_a])             \n\t"
-    "lwc1       %[f3],          132(%[tmp_a])             \n\t"
     "lwc1       %[f4],          256(%[tmp_a])             \n\t"
-    "lwc1       %[f5],          260(%[tmp_a])             \n\t"
     "lwc1       %[f6],          384(%[tmp_a])             \n\t"
+    "lwc1       %[f1],          4(%[tmp_a])               \n\t"  // f1 = [4]
+    "lwc1       %[f3],          132(%[tmp_a])             \n\t"  // f3 = [132]
+    "lwc1       %[f5],          260(%[tmp_a])             \n\t"  // f5 = [260]
     "lwc1       %[f7],          388(%[tmp_a])             \n\t"
-    "add.s      %[x0r],         %[f0],        %[f2]       \n\t"
-    "add.s      %[x0i],         %[f1],        %[f3]       \n\t"
-    "add.s      %[x2r],         %[f4],        %[f6]       \n\t"
-    "add.s      %[x2i],         %[f5],        %[f7]       \n\t"
-    "sub.s      %[x1r],         %[f0],        %[f2]       \n\t"
-    "sub.s      %[x1i],         %[f1],        %[f3]       \n\t"
-    "sub.s      %[x3r],         %[f4],        %[f6]       \n\t"
-    "sub.s      %[x3i],         %[f5],        %[f7]       \n\t"
-    "add.s      %[f0],          %[x0r],       %[x2r]      \n\t"
-    "add.s      %[f1],          %[x0i],       %[x2i]      \n\t"
-    "sub.s      %[f4],          %[x0r],       %[x2r]      \n\t"
-    "sub.s      %[f5],          %[x0i],       %[x2i]      \n\t"
-    "sub.s      %[f2],          %[x1r],       %[x3i]      \n\t"
-    "add.s      %[f3],          %[x1i],       %[x3r]      \n\t"
-    "add.s      %[f6],          %[x1r],       %[x3i]      \n\t"
-    "sub.s      %[f7],          %[x1i],       %[x3r]      \n\t"
-    "swc1       %[f0],          0(%[tmp_a])               \n\t"
-    "swc1       %[f1],          4(%[tmp_a])               \n\t"
-    "swc1       %[f2],          128(%[tmp_a])             \n\t"
-    "swc1       %[f3],          132(%[tmp_a])             \n\t"
-    "swc1       %[f4],          256(%[tmp_a])             \n\t"
-    "swc1       %[f5],          260(%[tmp_a])             \n\t"
-    "swc1       %[f6],          384(%[tmp_a])             \n\t"
-    "swc1       %[f7],          388(%[tmp_a])             \n\t"
+    "add.s      %[f8],          %[f0],        %[f2]       \n\t"  // f8 = [0] + [128]
+    "sub.s      %[f0],          %[f0],        %[f2]       \n\t"  // f0 = [0] - [128]
+    "add.s      %[f2],          %[f4],        %[f6]       \n\t"  // f2 = [256] + [384]
+    "sub.s      %[f4],          %[f4],        %[f6]       \n\t"  // f4 = [256] - [384]
+    "add.s      %[f6],          %[f1],        %[f3]       \n\t"  // f6 = [4] + [132]
+    "sub.s      %[f1],          %[f1],        %[f3]       \n\t"  // f1 = [4] - [132]
+    "add.s      %[f3],          %[f5],        %[f7]       \n\t"  // f3 = [260] + [388]
+    "sub.s      %[f5],          %[f5],        %[f7]       \n\t"  // f5 = [260] - [388]
+    "add.s      %[f7],          %[f8],        %[f2]       \n\t"  // f7 = f8 + f2, stored to [0]
+    "sub.s      %[f8],          %[f8],        %[f2]       \n\t"  // f8 = f8 - f2, stored to [256]
+    "add.s      %[f2],          %[f1],        %[f4]       \n\t"  // f2 = f1 + f4, stored to [132]
+    "sub.s      %[f1],          %[f1],        %[f4]       \n\t"  // f1 = f1 - f4, stored to [388]
+    "add.s      %[f4],          %[f6],        %[f3]       \n\t"  // f4 = f6 + f3, stored to [4]
+    "sub.s      %[f6],          %[f6],        %[f3]       \n\t"  // f6 = f6 - f3, stored to [260]
+    "sub.s      %[f3],          %[f0],        %[f5]       \n\t"  // f3 = f0 - f5, stored to [128]
+    "add.s      %[f0],          %[f0],        %[f5]       \n\t"  // f0 = f0 + f5, stored to [384]
+    "swc1       %[f7],          0(%[tmp_a])               \n\t"  // Write all eight results back in place.
+    "swc1       %[f8],          256(%[tmp_a])             \n\t"
+    "swc1       %[f2],          132(%[tmp_a])             \n\t"
+    "swc1       %[f1],          388(%[tmp_a])             \n\t"
+    "swc1       %[f4],          4(%[tmp_a])               \n\t"
+    "swc1       %[f6],          260(%[tmp_a])             \n\t"
+    "swc1       %[f3],          128(%[tmp_a])             \n\t"
+    "swc1       %[f0],          384(%[tmp_a])             \n\t"
     "bgtz       %[count],       1b                        \n\t"
     " addiu     %[tmp_a],       %[tmp_a],   8             \n\t"
     ".set       pop                                       \n\t"
     : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
       [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
-      [x0r] "=&f" (x0r), [x0i] "=&f" (x0i), [x1r] "=&f" (x1r),
-      [x1i] "=&f" (x1i), [x2r] "=&f" (x2r), [x2i] "=&f" (x2i),
-      [x3r] "=&f" (x3r), [x3i] "=&f" (x3i), [tmp_a] "=&r" (tmp_a),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a),  // f8 is the single extra temp replacing the eight x* operands.
       [count] "=&r" (count)
     : [a] "r" (a)
     : "memory"
   );
 }
 
+static void cftbsub_128_mips(float* a) {  // Two stage calls, then a 16-pass in-place sum/difference loop over a.
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8;  // asm temporaries; [n] below means the float at tmp_a + n bytes.
+  int tmp_a, count;  // tmp_a holds an address in an int. NOTE(review): relies on pointers fitting in int.
+
+  cft1st_128(a);  // Dispatched through the cft1st_128 function pointer.
+  cftmdl_128(a);  // Dispatched through the cftmdl_128 function pointer.
+
+  __asm __volatile (
+    ".set       push                                        \n\t"
+    ".set       noreorder                                   \n\t"  // Branch delay slot is filled by hand.
+    "addiu      %[tmp_a],   %[a],           0               \n\t"  // tmp_a = a
+    "addiu      %[count],   $zero,          16              \n\t"  // count = 16 passes
+   "1:                                                      \n\t"
+    "addiu      %[count],   %[count],       -1              \n\t"  // --count
+    "lwc1       %[f0],      0(%[tmp_a])                     \n\t"  // f0 = [0]
+    "lwc1       %[f2],      128(%[tmp_a])                   \n\t"  // f2 = [128]
+    "lwc1       %[f4],      256(%[tmp_a])                   \n\t"  // f4 = [256]
+    "lwc1       %[f6],      384(%[tmp_a])                   \n\t"  // f6 = [384]
+    "lwc1       %[f1],      4(%[tmp_a])                     \n\t"  // f1 = [4]
+    "lwc1       %[f3],      132(%[tmp_a])                   \n\t"  // f3 = [132]
+    "lwc1       %[f5],      260(%[tmp_a])                   \n\t"  // f5 = [260]
+    "lwc1       %[f7],      388(%[tmp_a])                   \n\t"  // f7 = [388]
+    "add.s      %[f8],      %[f0],          %[f2]           \n\t"  // f8 = [0] + [128]
+    "sub.s      %[f0],      %[f0],          %[f2]           \n\t"  // f0 = [0] - [128]
+    "add.s      %[f2],      %[f4],          %[f6]           \n\t"  // f2 = [256] + [384]
+    "sub.s      %[f4],      %[f4],          %[f6]           \n\t"  // f4 = [256] - [384]
+    "add.s      %[f6],      %[f1],          %[f3]           \n\t"  // f6 = [4] + [132]
+    "sub.s      %[f1],      %[f3],          %[f1]           \n\t"  // f1 = [132] - [4]
+    "add.s      %[f3],      %[f5],          %[f7]           \n\t"  // f3 = [260] + [388]
+    "sub.s      %[f5],      %[f5],          %[f7]           \n\t"  // f5 = [260] - [388]
+    "add.s      %[f7],      %[f8],          %[f2]           \n\t"  // f7 = f8 + f2, stored to [0]
+    "sub.s      %[f8],      %[f8],          %[f2]           \n\t"  // f8 = f8 - f2, stored to [256]
+    "sub.s      %[f2],      %[f1],          %[f4]           \n\t"  // f2 = f1 - f4, stored to [132]
+    "add.s      %[f1],      %[f1],          %[f4]           \n\t"  // f1 = f1 + f4, stored to [388]
+    "add.s      %[f4],      %[f3],          %[f6]           \n\t"  // f4 = f3 + f6, negated below
+    "sub.s      %[f6],      %[f3],          %[f6]           \n\t"  // f6 = f3 - f6, stored to [260]
+    "sub.s      %[f3],      %[f0],          %[f5]           \n\t"  // f3 = f0 - f5, stored to [128]
+    "add.s      %[f0],      %[f0],          %[f5]           \n\t"  // f0 = f0 + f5, stored to [384]
+    "neg.s      %[f4],      %[f4]                           \n\t"  // f4 = -(f3 + f6), stored to [4]
+    "swc1       %[f7],      0(%[tmp_a])                     \n\t"
+    "swc1       %[f8],      256(%[tmp_a])                   \n\t"
+    "swc1       %[f2],      132(%[tmp_a])                   \n\t"
+    "swc1       %[f1],      388(%[tmp_a])                   \n\t"
+    "swc1       %[f6],      260(%[tmp_a])                   \n\t"
+    "swc1       %[f3],      128(%[tmp_a])                   \n\t"
+    "swc1       %[f0],      384(%[tmp_a])                   \n\t"
+    "swc1       %[f4],       4(%[tmp_a])                     \n\t"
+    "bgtz       %[count],   1b                              \n\t"  // Repeat while count > 0.
+    " addiu     %[tmp_a],   %[tmp_a],       8               \n\t"  // Delay slot: tmp_a += 8 bytes on every pass.
+    ".set       pop                                         \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a)  // Base address; only read by the asm.
+    : "memory"  // The asm loads and stores through tmp_a.
+  );
+}
+
+static void rftfsub_128_mips(float* a) {  // In-place update of a[2..63] and a[66..127], 31 mirrored pairs.
+  const float* c = rdft_w + 32;  // Coefficients are read from c[1..31].
+  const float f0 = 0.5f;  // Constant operand of the 0.5 - *c2 terms.
+  float* a1 = &a[2];  // Ascending cursor, two floats per pair.
+  float* a2 = &a[126];  // Descending cursor, two floats per pair.
+  const float* c1 = &c[1];  // Ascending coefficient cursor (wki).
+  const float* c2 = &c[31];  // Descending coefficient cursor (feeds wkr).
+  float f1, f2, f3 ,f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;  // asm temporaries.
+  int count;  // Loop passes remaining.
+
+  __asm __volatile (
+    ".set      push                                             \n\t"
+    ".set      noreorder                                        \n\t"  // Delay slot is filled by hand.
+    "lwc1      %[f6],       0(%[c2])                            \n\t"  // Peeled first pair: f6 = *c2
+    "lwc1      %[f1],       0(%[a1])                            \n\t"  // f1 = a1[0]
+    "lwc1      %[f2],       0(%[a2])                            \n\t"  // f2 = a2[0]
+    "lwc1      %[f3],       4(%[a1])                            \n\t"  // f3 = a1[1]
+    "lwc1      %[f4],       4(%[a2])                            \n\t"  // f4 = a2[1]
+    "lwc1      %[f5],       0(%[c1])                            \n\t"  // f5 = wki = *c1
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"  // f6 = wkr = 0.5 - *c2
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"  // f7 = xr = a1[0] - a2[0]
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"  // f8 = xi = a1[1] + a2[1]
+    "addiu     %[count],    $zero,        15                    \n\t"  // 15 passes, two pairs each
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"  // f9 = wkr * xr
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"  // f6 = wkr * xi
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"  // f8 = wki * xi
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"  // f5 = wki * xr
+    "sub.s     %[f9],       %[f9],        %[f8]                 \n\t"  // f9 = yr = wkr*xr - wki*xi
+    "add.s     %[f6],       %[f6],        %[f5]                 \n\t"  // f6 = yi = wkr*xi + wki*xr
+#else
+    "nmsub.s   %[f9],       %[f9],        %[f5],      %[f8]     \n\t"  // f9 = yr = f9 - wki*xi
+    "madd.s    %[f6],       %[f6],        %[f5],      %[f7]     \n\t"  // f6 = yi = f6 + wki*xr
+#endif
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"  // a1[0] - yr
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"  // a2[0] + yr
+    "sub.s     %[f3],       %[f3],        %[f6]                 \n\t"  // a1[1] - yi
+    "sub.s     %[f4],       %[f4],        %[f6]                 \n\t"  // a2[1] - yi
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    "addiu     %[a1],       %[a1],        8                     \n\t"  // a1 += 8 bytes
+    "addiu     %[a2],       %[a2],        -8                    \n\t"  // a2 -= 8 bytes
+    "addiu     %[c1],       %[c1],        4                     \n\t"  // c1 += 4 bytes
+    "addiu     %[c2],       %[c2],        -4                    \n\t"  // c2 -= 4 bytes
+   "1:                                                          \n\t"
+    "lwc1      %[f6],       0(%[c2])                            \n\t"  // Pair A: same math as the peeled pair.
+    "lwc1      %[f1],       0(%[a1])                            \n\t"
+    "lwc1      %[f2],       0(%[a2])                            \n\t"
+    "lwc1      %[f3],       4(%[a1])                            \n\t"
+    "lwc1      %[f4],       4(%[a2])                            \n\t"
+    "lwc1      %[f5],       0(%[c1])                            \n\t"
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"  // f6 = wkrA
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"  // f7 = xrA
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"  // f8 = xiA
+    "lwc1      %[f10],      -4(%[c2])                           \n\t"  // Pair B loads begin: f10 = c2[-1]
+    "lwc1      %[f11],      8(%[a1])                            \n\t"  // f11 = a1[2]
+    "lwc1      %[f12],      -8(%[a2])                           \n\t"  // f12 = a2[-2]
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"  // f9 = wkrA * xrA
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"  // f6 = wkrA * xiA
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"
+    "lwc1      %[f13],      12(%[a1])                           \n\t"  // f13 = a1[3]
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"  // f14 = a2[-1]
+    "lwc1      %[f15],      4(%[c1])                            \n\t"  // f15 = wkiB = c1[1]
+    "sub.s     %[f9],       %[f9],        %[f8]                 \n\t"  // f9 = yrA
+    "add.s     %[f6],       %[f6],        %[f5]                 \n\t"  // f6 = yiA
+#else
+    "lwc1      %[f13],      12(%[a1])                           \n\t"  // f13 = a1[3]
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"  // f14 = a2[-1]
+    "lwc1      %[f15],      4(%[c1])                            \n\t"  // f15 = wkiB = c1[1]
+    "nmsub.s   %[f9],       %[f9],        %[f5],      %[f8]     \n\t"  // f9 = yrA
+    "madd.s    %[f6],       %[f6],        %[f5],      %[f7]     \n\t"  // f6 = yiA
+#endif
+    "sub.s     %[f10],      %[f0],        %[f10]                \n\t"  // f10 = wkrB = 0.5 - c2[-1]
+    "sub.s     %[f5],       %[f11],       %[f12]                \n\t"  // f5 = xrB = a1[2] - a2[-2]
+    "add.s     %[f7],       %[f13],       %[f14]                \n\t"  // f7 = xiB = a1[3] + a2[-1]
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"  // a1[0] - yrA
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"  // a2[0] + yrA
+    "sub.s     %[f3],       %[f3],        %[f6]                 \n\t"  // a1[1] - yiA
+    "mul.s     %[f8],       %[f10],       %[f5]                 \n\t"  // f8 = wkrB * xrB
+    "mul.s     %[f10],      %[f10],       %[f7]                 \n\t"  // f10 = wkrB * xiB
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f9],       %[f15],       %[f7]                 \n\t"  // f9 = wkiB * xiB
+    "mul.s     %[f15],      %[f15],       %[f5]                 \n\t"  // f15 = wkiB * xrB
+    "sub.s     %[f4],       %[f4],        %[f6]                 \n\t"  // a2[1] - yiA
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "sub.s     %[f8],       %[f8],        %[f9]                 \n\t"  // f8 = yrB
+    "add.s     %[f10],      %[f10],       %[f15]                \n\t"  // f10 = yiB
+#else
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "sub.s     %[f4],       %[f4],        %[f6]                 \n\t"  // a2[1] - yiA
+    "nmsub.s   %[f8],       %[f8],        %[f15],     %[f7]     \n\t"  // f8 = yrB
+    "madd.s    %[f10],      %[f10],       %[f15],     %[f5]     \n\t"  // f10 = yiB
+#endif
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    "sub.s     %[f11],      %[f11],       %[f8]                 \n\t"  // a1[2] - yrB
+    "add.s     %[f12],      %[f12],       %[f8]                 \n\t"  // a2[-2] + yrB
+    "sub.s     %[f13],      %[f13],       %[f10]                \n\t"  // a1[3] - yiB
+    "sub.s     %[f14],      %[f14],       %[f10]                \n\t"  // a2[-1] - yiB
+    "addiu     %[c2],       %[c2],        -8                    \n\t"  // c2 -= 8 bytes
+    "addiu     %[c1],       %[c1],        8                     \n\t"  // c1 += 8 bytes
+    "swc1      %[f11],      8(%[a1])                            \n\t"
+    "swc1      %[f12],      -8(%[a2])                           \n\t"
+    "swc1      %[f13],      12(%[a1])                           \n\t"
+    "swc1      %[f14],      -4(%[a2])                           \n\t"
+    "addiu     %[a1],       %[a1],        16                    \n\t"  // a1 += 16 bytes
+    "addiu     %[count],    %[count],     -1                    \n\t"  // --count
+    "bgtz      %[count],    1b                                  \n\t"  // Repeat while count > 0.
+    " addiu    %[a2],       %[a2],        -16                   \n\t"  // Delay slot: a2 -= 16 bytes.
+    ".set      pop                                              \n\t"
+    : [a1] "+r" (a1), [a2] "+r" (a2), [c1] "+r" (c1), [c2] "+r" (c2),
+      [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), [f4] "=&f" (f4),
+      [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+      [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11), [f12] "=&f" (f12),
+      [f13] "=&f" (f13), [f14] "=&f" (f14), [f15] "=&f" (f15),
+      [count] "=&r" (count)
+    : [f0] "f" (f0)  // The 0.5f constant; only read.
+    : "memory"  // The asm stores through a1 and a2.
+  );
+}
+
+static void rftbsub_128_mips(float* a) {  // Negates a[1] and a[65], then updates 31 mirrored pairs in place.
+  const float *c = rdft_w + 32;  // Coefficients are read from c[1..31].
+  const float f0 = 0.5f;  // Constant operand of the 0.5 - *c2 terms.
+  float* a1 = &a[2];  // Ascending cursor, two floats per pair.
+  float* a2 = &a[126];  // Descending cursor, two floats per pair.
+  const float* c1 = &c[1];  // Ascending coefficient cursor (wki).
+  const float* c2 = &c[31];  // Descending coefficient cursor (feeds wkr).
+  float f1, f2, f3 ,f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;  // asm temporaries.
+  int count;  // Loop passes remaining.
+
+  a[1] = -a[1];  // Sign flips applied before the asm pass; the asm never touches a[1].
+  a[65] = -a[65];  // a[65] is likewise outside the range the asm updates.
+
+  __asm __volatile (
+    ".set      push                                             \n\t"
+    ".set      noreorder                                        \n\t"  // Delay slot is filled by hand.
+    "lwc1      %[f6],       0(%[c2])                            \n\t"  // Peeled first pair: f6 = *c2
+    "lwc1      %[f1],       0(%[a1])                            \n\t"  // f1 = a1[0]
+    "lwc1      %[f2],       0(%[a2])                            \n\t"  // f2 = a2[0]
+    "lwc1      %[f3],       4(%[a1])                            \n\t"  // f3 = a1[1]
+    "lwc1      %[f4],       4(%[a2])                            \n\t"  // f4 = a2[1]
+    "lwc1      %[f5],       0(%[c1])                            \n\t"  // f5 = wki = *c1
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"  // f6 = wkr = 0.5 - *c2
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"  // f7 = xr = a1[0] - a2[0]
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"  // f8 = xi = a1[1] + a2[1]
+    "addiu     %[count],    $zero,        15                    \n\t"  // 15 passes, two pairs each
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"  // f9 = wkr * xr
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"  // f6 = wkr * xi
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"  // f8 = wki * xi
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"  // f5 = wki * xr
+    "add.s     %[f9],       %[f9],        %[f8]                 \n\t"  // f9 = yr = wkr*xr + wki*xi
+    "sub.s     %[f6],       %[f6],        %[f5]                 \n\t"  // f6 = yi = wkr*xi - wki*xr
+#else
+    "madd.s    %[f9],       %[f9],        %[f5],      %[f8]     \n\t"  // f9 = yr = f9 + wki*xi
+    "nmsub.s   %[f6],       %[f6],        %[f5],      %[f7]     \n\t"  // f6 = yi = f6 - wki*xr
+#endif
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"  // a1[0] - yr
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"  // a2[0] + yr
+    "sub.s     %[f3],       %[f6],        %[f3]                 \n\t"  // yi - a1[1]
+    "sub.s     %[f4],       %[f6],        %[f4]                 \n\t"  // yi - a2[1]
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    "addiu     %[a1],       %[a1],        8                     \n\t"  // a1 += 8 bytes
+    "addiu     %[a2],       %[a2],        -8                    \n\t"  // a2 -= 8 bytes
+    "addiu     %[c1],       %[c1],        4                     \n\t"  // c1 += 4 bytes
+    "addiu     %[c2],       %[c2],        -4                    \n\t"  // c2 -= 4 bytes
+   "1:                                                          \n\t"
+    "lwc1      %[f6],       0(%[c2])                            \n\t"  // Pair A: same math as the peeled pair.
+    "lwc1      %[f1],       0(%[a1])                            \n\t"
+    "lwc1      %[f2],       0(%[a2])                            \n\t"
+    "lwc1      %[f3],       4(%[a1])                            \n\t"
+    "lwc1      %[f4],       4(%[a2])                            \n\t"
+    "lwc1      %[f5],       0(%[c1])                            \n\t"
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"  // f6 = wkrA
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"  // f7 = xrA
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"  // f8 = xiA
+    "lwc1      %[f10],      -4(%[c2])                           \n\t"  // Pair B loads begin: f10 = c2[-1]
+    "lwc1      %[f11],      8(%[a1])                            \n\t"  // f11 = a1[2]
+    "lwc1      %[f12],      -8(%[a2])                           \n\t"  // f12 = a2[-2]
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"  // f9 = wkrA * xrA
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"  // f6 = wkrA * xiA
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"
+    "lwc1      %[f13],      12(%[a1])                           \n\t"  // f13 = a1[3]
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"  // f14 = a2[-1]
+    "lwc1      %[f15],      4(%[c1])                            \n\t"  // f15 = wkiB = c1[1]
+    "add.s     %[f9],       %[f9],        %[f8]                 \n\t"  // f9 = yrA
+    "sub.s     %[f6],       %[f6],        %[f5]                 \n\t"  // f6 = yiA
+#else
+    "lwc1      %[f13],      12(%[a1])                           \n\t"  // f13 = a1[3]
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"  // f14 = a2[-1]
+    "lwc1      %[f15],      4(%[c1])                            \n\t"  // f15 = wkiB = c1[1]
+    "madd.s    %[f9],       %[f9],        %[f5],      %[f8]     \n\t"  // f9 = yrA
+    "nmsub.s   %[f6],       %[f6],        %[f5],      %[f7]     \n\t"  // f6 = yiA
+#endif
+    "sub.s     %[f10],      %[f0],        %[f10]                \n\t"  // f10 = wkrB = 0.5 - c2[-1]
+    "sub.s     %[f5],       %[f11],       %[f12]                \n\t"  // f5 = xrB = a1[2] - a2[-2]
+    "add.s     %[f7],       %[f13],       %[f14]                \n\t"  // f7 = xiB = a1[3] + a2[-1]
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"  // a1[0] - yrA
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"  // a2[0] + yrA
+    "sub.s     %[f3],       %[f6],        %[f3]                 \n\t"  // yiA - a1[1]
+    "mul.s     %[f8],       %[f10],       %[f5]                 \n\t"  // f8 = wkrB * xrB
+    "mul.s     %[f10],      %[f10],       %[f7]                 \n\t"  // f10 = wkrB * xiB
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f9],       %[f15],       %[f7]                 \n\t"  // f9 = wkiB * xiB
+    "mul.s     %[f15],      %[f15],       %[f5]                 \n\t"  // f15 = wkiB * xrB
+    "sub.s     %[f4],       %[f6],        %[f4]                 \n\t"  // yiA - a2[1]
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "add.s     %[f8],       %[f8],        %[f9]                 \n\t"  // f8 = yrB
+    "sub.s     %[f10],      %[f10],       %[f15]                \n\t"  // f10 = yiB
+#else
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "sub.s     %[f4],       %[f6],        %[f4]                 \n\t"  // yiA - a2[1]
+    "madd.s    %[f8],       %[f8],        %[f15],     %[f7]     \n\t"  // f8 = yrB
+    "nmsub.s   %[f10],      %[f10],       %[f15],     %[f5]     \n\t"  // f10 = yiB
+#endif
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    "sub.s     %[f11],      %[f11],       %[f8]                 \n\t"  // a1[2] - yrB
+    "add.s     %[f12],      %[f12],       %[f8]                 \n\t"  // a2[-2] + yrB
+    "sub.s     %[f13],      %[f10],       %[f13]                \n\t"  // yiB - a1[3]
+    "sub.s     %[f14],      %[f10],       %[f14]                \n\t"  // yiB - a2[-1]
+    "addiu     %[c2],       %[c2],        -8                    \n\t"  // c2 -= 8 bytes
+    "addiu     %[c1],       %[c1],        8                     \n\t"  // c1 += 8 bytes
+    "swc1      %[f11],      8(%[a1])                            \n\t"
+    "swc1      %[f12],      -8(%[a2])                           \n\t"
+    "swc1      %[f13],      12(%[a1])                           \n\t"
+    "swc1      %[f14],      -4(%[a2])                           \n\t"
+    "addiu     %[a1],       %[a1],        16                    \n\t"  // a1 += 16 bytes
+    "addiu     %[count],    %[count],     -1                    \n\t"  // --count
+    "bgtz      %[count],    1b                                  \n\t"  // Repeat while count > 0.
+    " addiu    %[a2],       %[a2],        -16                   \n\t"  // Delay slot: a2 -= 16 bytes.
+    ".set      pop                                              \n\t"
+    : [a1] "+r" (a1), [a2] "+r" (a2), [c1] "+r" (c1), [c2] "+r" (c2),
+      [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), [f4] "=&f" (f4),
+      [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+      [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11), [f12] "=&f" (f12),
+      [f13] "=&f" (f13), [f14] "=&f" (f14), [f15] "=&f" (f15),
+      [count] "=&r" (count)
+    : [f0] "f" (f0)  // The 0.5f constant; only read.
+    : "memory"  // The asm stores through a1 and a2.
+  );
+}
+
 void aec_rdft_init_mips(void) {
+  cft1st_128 = cft1st_128_mips;  // Point each shared routine pointer at its MIPS implementation.
+  cftmdl_128 = cftmdl_128_mips;  // NOTE(review): cftmdl_128_mips is defined outside this view.
+  rftfsub_128 = rftfsub_128_mips;
+  rftbsub_128 = rftbsub_128_mips;
   cftfsub_128 = cftfsub_128_mips;
+  cftbsub_128 = cftbsub_128_mips;  // Called routines reach cft1st_128/cftmdl_128 through these pointers.
   bitrv2_128 = bitrv2_128_mips;
 }