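Update Android.bp following XNNPACK rebase

Sync the build source lists with the rebased XNNPACK tree:
- Add the ELU subgraph node (src/subgraph/elu.c) and the f32-velu
  micro-kernels for scalar, NEON, NEONFMA, SSE2, SSE4.1, AVX, AVX2 and
  AVX512F.
- Add the f32-spmm Nx1 NEON kernels and move the Nx1 NEONFMA entries
  earlier in the file.
- Add the 3x8 and 5x8 SSE GEMM/IGEMM kernels and the SSE2 "dup"
  GEMM/IGEMM variants.
- Add the NEON f32-conv-hwc2chw 3x3s2p1c3x4 kernel.
- Drop the NEON sigmoid sources that are no longer listed (frac-p9-p10
  and the rr1 math variants).
- Re-sort the expm1minus entries in the NEONFMA and AVX2 math lists.
- Remove PSIMD_ACCMATH_UKERNELS and the xnnpack_psimd_accmath_ukernels
  library, including its whole_static_libs references for arm, arm64,
  x86 and x86_64.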

Test: mm
Change-Id: Iee60aa7de9c417eaf55c334763a38697c9c9291d
diff --git a/Android.bp b/Android.bp
index 2575cf8..9220694 100644
--- a/Android.bp
+++ b/Android.bp
@@ -55,6 +55,7 @@
     "src/subgraph/depth-to-space.c",
     "src/subgraph/depthwise-convolution-2d.c",
     "src/subgraph/divide.c",
+    "src/subgraph/elu.c",
     "src/subgraph/floor.c",
     "src/subgraph/fully-connected.c",
     "src/subgraph/global-average-pooling-2d.c",
@@ -390,6 +391,18 @@
     "src/f32-vbinary/gen/vsubc-scalar-x2.c",
     "src/f32-vbinary/gen/vsubc-scalar-x4.c",
     "src/f32-vbinary/gen/vsubc-scalar-x8.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x1.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x2.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x3.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x5.c",
+    "src/f32-velu/gen/velu-scalar-rr2-lut16-p3-x6.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x1.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x2.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x3.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x5.c",
+    "src/f32-velu/gen/velu-scalar-rr2-p6-x6.c",
     "src/f32-vlrelu/gen/vlrelu-scalar-x1.c",
     "src/f32-vlrelu/gen/vlrelu-scalar-x2.c",
     "src/f32-vlrelu/gen/vlrelu-scalar-x4.c",
@@ -494,13 +507,6 @@
     "src/xx-copy/memcpy.c",
 ]
 
-PSIMD_ACCMATH_UKERNELS = [
-    "src/qs8-requantization/fp32-psimd.c",
-    "src/qs8-requantization/precise-psimd.c",
-    "src/qu8-requantization/fp32-psimd.c",
-    "src/qu8-requantization/precise-psimd.c",
-]
-
 // ISA-specific micro-kernels
 NEON_UKERNELS = [
     "src/f32-argmaxpool/4x-neon-c4.c",
@@ -518,6 +524,7 @@
     "src/f32-conv-hwc/gen/3x3s2p1c3x4-neon-2x2.c",
     "src/f32-conv-hwc/gen/3x3s2p1c3x8-neon-2x1.c",
     "src/f32-conv-hwc/gen/3x3s2p1c3x8-neon-2x2.c",
+    "src/f32-conv-hwc2chw/3x3s2p1c3x4-neon-2x2.c",
     "src/f32-dwconv/gen/up4x4-minmax-neon-acc2.c",
     "src/f32-dwconv/gen/up4x4-minmax-neon.c",
     "src/f32-dwconv/gen/up4x9-minmax-neon-acc2.c",
@@ -667,7 +674,6 @@
     "src/f32-relu/gen/neon-x4.c",
     "src/f32-relu/gen/neon-x8.c",
     "src/f32-rmax/neon.c",
-    "src/f32-sigmoid/gen/neon-frac-p9-p10-nr1recps-x16.c",
     "src/f32-sigmoid/gen/neon-rr2-lut64-p2-nr2recps-x4.c",
     "src/f32-sigmoid/gen/neon-rr2-lut64-p2-nr2recps-x8.c",
     "src/f32-sigmoid/gen/neon-rr2-lut64-p2-nr2recps-x12.c",
@@ -686,6 +692,19 @@
     "src/f32-sigmoid/gen/neon-rr2-p5-nr2recps-x16.c",
     "src/f32-sigmoid/gen/neon-rr2-p5-nr2recps-x20.c",
     "src/f32-sigmoid/gen/neon-rr2-p5-nr2recps-x24.c",
+    "src/f32-spmm/gen/4x1-minmax-neon-pipelined.c",
+    "src/f32-spmm/gen/4x1-minmax-neon-x2.c",
+    "src/f32-spmm/gen/4x1-minmax-neon.c",
+    "src/f32-spmm/gen/8x1-minmax-neon-pipelined.c",
+    "src/f32-spmm/gen/8x1-minmax-neon-x2.c",
+    "src/f32-spmm/gen/8x1-minmax-neon.c",
+    "src/f32-spmm/gen/12x1-minmax-neon.c",
+    "src/f32-spmm/gen/16x1-minmax-neon-pipelined.c",
+    "src/f32-spmm/gen/16x1-minmax-neon-x2.c",
+    "src/f32-spmm/gen/16x1-minmax-neon.c",
+    "src/f32-spmm/gen/32x1-minmax-neon-pipelined.c",
+    "src/f32-spmm/gen/32x1-minmax-neon-x2.c",
+    "src/f32-spmm/gen/32x1-minmax-neon.c",
     "src/f32-vbinary/gen/vadd-minmax-neon-x4.c",
     "src/f32-vbinary/gen/vadd-minmax-neon-x8.c",
     "src/f32-vbinary/gen/vaddc-minmax-neon-x4.c",
@@ -712,6 +731,18 @@
     "src/f32-vbinary/gen/vsub-minmax-neon-x8.c",
     "src/f32-vbinary/gen/vsubc-minmax-neon-x4.c",
     "src/f32-vbinary/gen/vsubc-minmax-neon-x8.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x12.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x20.c",
+    "src/f32-velu/gen/velu-neon-rr2-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x8.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x12.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x16.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x20.c",
+    "src/f32-velu/gen/velu-neon-rr2-p6-x24.c",
     "src/f32-vlrelu/gen/vlrelu-neon-x4.c",
     "src/f32-vlrelu/gen/vlrelu-neon-x8.c",
     "src/f32-vmulcaddc/gen/c4-minmax-neon-2x.c",
@@ -739,10 +770,6 @@
     "src/math/roundu-neon-cvt.c",
     "src/math/roundz-neon-addsub.c",
     "src/math/roundz-neon-cvt.c",
-    "src/math/sigmoid-neon-frac-p9-p10-nr1recps.c",
-    "src/math/sigmoid-neon-rr1-lut64-p2-nr2recps.c",
-    "src/math/sigmoid-neon-rr1-lut2048-p1-nr2recps.c",
-    "src/math/sigmoid-neon-rr1-p5-nr2recps.c",
     "src/math/sigmoid-neon-rr2-lut64-p2-nr2recps.c",
     "src/math/sigmoid-neon-rr2-lut2048-p1-nr2recps.c",
     "src/math/sigmoid-neon-rr2-p5-nr2recps.c",
@@ -932,6 +959,31 @@
     "src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x16.c",
     "src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x20.c",
     "src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x24.c",
+    "src/f32-spmm/gen/4x1-minmax-neonfma-pipelined.c",
+    "src/f32-spmm/gen/4x1-minmax-neonfma-x2.c",
+    "src/f32-spmm/gen/4x1-minmax-neonfma.c",
+    "src/f32-spmm/gen/8x1-minmax-neonfma-pipelined.c",
+    "src/f32-spmm/gen/8x1-minmax-neonfma-x2.c",
+    "src/f32-spmm/gen/8x1-minmax-neonfma.c",
+    "src/f32-spmm/gen/12x1-minmax-neonfma.c",
+    "src/f32-spmm/gen/16x1-minmax-neonfma-pipelined.c",
+    "src/f32-spmm/gen/16x1-minmax-neonfma-x2.c",
+    "src/f32-spmm/gen/16x1-minmax-neonfma.c",
+    "src/f32-spmm/gen/32x1-minmax-neonfma-pipelined.c",
+    "src/f32-spmm/gen/32x1-minmax-neonfma-x2.c",
+    "src/f32-spmm/gen/32x1-minmax-neonfma.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x12.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x20.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x4.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x8.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x12.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x16.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x20.c",
+    "src/f32-velu/gen/velu-neonfma-rr1-p6-x24.c",
     "src/f32-vmulcaddc/gen/c4-minmax-neonfma-2x.c",
     "src/f32-vmulcaddc/gen/c8-minmax-neonfma-2x.c",
     "src/f32-vsqrt/gen/neonfma-nr1rsqrts1fma1adj-x4.c",
@@ -956,11 +1008,11 @@
     "src/f32-vsqrt/gen/neonfma-nr2fma1adj-x40.c",
     "src/math/exp-neonfma-rr2-lut64-p2.c",
     "src/math/exp-neonfma-rr2-p5.c",
+    "src/math/expm1minus-neonfma-rr1-lut16-p3.c",
+    "src/math/expm1minus-neonfma-rr1-p6.c",
     "src/math/expminus-neonfma-rr2-lut64-p2.c",
     "src/math/expminus-neonfma-rr2-lut2048-p1.c",
     "src/math/expminus-neonfma-rr2-p5.c",
-    "src/math/expm1minus-neonfma-rr1-lut16-p3.c",
-    "src/math/expm1minus-neonfma-rr1-p6.c",
     "src/math/sigmoid-neonfma-rr1-lut64-p2-nr1recps1fma.c",
     "src/math/sigmoid-neonfma-rr1-lut64-p2-nr2fma.c",
     "src/math/sigmoid-neonfma-rr1-lut64-p2-nr2recps.c",
@@ -1075,27 +1127,14 @@
     "src/f32-sigmoid/gen/neonfma-rr1-p5-div-x16.c",
     "src/f32-sigmoid/gen/neonfma-rr1-p5-div-x20.c",
     "src/f32-sigmoid/gen/neonfma-rr1-p5-div-x24.c",
-    "src/f32-spmm/gen/4x1-minmax-neonfma-pipelined.c",
-    "src/f32-spmm/gen/4x1-minmax-neonfma-x2.c",
-    "src/f32-spmm/gen/4x1-minmax-neonfma.c",
     "src/f32-spmm/gen/4x2-minmax-neonfma.c",
     "src/f32-spmm/gen/4x4-minmax-neonfma.c",
-    "src/f32-spmm/gen/8x1-minmax-neonfma-pipelined.c",
-    "src/f32-spmm/gen/8x1-minmax-neonfma-x2.c",
-    "src/f32-spmm/gen/8x1-minmax-neonfma.c",
     "src/f32-spmm/gen/8x2-minmax-neonfma.c",
     "src/f32-spmm/gen/8x4-minmax-neonfma.c",
-    "src/f32-spmm/gen/12x1-minmax-neonfma.c",
     "src/f32-spmm/gen/12x2-minmax-neonfma.c",
     "src/f32-spmm/gen/12x4-minmax-neonfma.c",
-    "src/f32-spmm/gen/16x1-minmax-neonfma-pipelined.c",
-    "src/f32-spmm/gen/16x1-minmax-neonfma-x2.c",
-    "src/f32-spmm/gen/16x1-minmax-neonfma.c",
     "src/f32-spmm/gen/16x2-minmax-neonfma.c",
     "src/f32-spmm/gen/16x4-minmax-neonfma.c",
-    "src/f32-spmm/gen/32x1-minmax-neonfma-pipelined.c",
-    "src/f32-spmm/gen/32x1-minmax-neonfma-x2.c",
-    "src/f32-spmm/gen/32x1-minmax-neonfma.c",
     "src/f32-spmm/gen/32x2-minmax-neonfma.c",
     "src/f32-spmm/gen/32x4-minmax-neonfma.c",
     "src/f32-vbinary/gen/vdiv-minmax-neon-x4.c",
@@ -1303,16 +1342,28 @@
     "src/f32-gemm/gen-inc/1x8inc-minmax-sse-dup.c",
     "src/f32-gemm/gen-inc/1x8inc-minmax-sse-load1.c",
     "src/f32-gemm/gen-inc/1x8s4inc-minmax-sse.c",
+    "src/f32-gemm/gen-inc/3x8inc-minmax-sse-dup.c",
+    "src/f32-gemm/gen-inc/3x8inc-minmax-sse-load1.c",
+    "src/f32-gemm/gen-inc/3x8s4inc-minmax-sse.c",
     "src/f32-gemm/gen-inc/4x8inc-minmax-sse-dup.c",
     "src/f32-gemm/gen-inc/4x8inc-minmax-sse-load1.c",
     "src/f32-gemm/gen-inc/4x8s4inc-minmax-sse.c",
+    "src/f32-gemm/gen-inc/5x8inc-minmax-sse-dup.c",
+    "src/f32-gemm/gen-inc/5x8inc-minmax-sse-load1.c",
+    "src/f32-gemm/gen-inc/5x8s4inc-minmax-sse.c",
     "src/f32-gemm/gen/1x8-minmax-sse-dup.c",
     "src/f32-gemm/gen/1x8-minmax-sse-load1.c",
     "src/f32-gemm/gen/1x8s4-minmax-sse.c",
+    "src/f32-gemm/gen/3x8-minmax-sse-dup.c",
+    "src/f32-gemm/gen/3x8-minmax-sse-load1.c",
+    "src/f32-gemm/gen/3x8s4-minmax-sse.c",
     "src/f32-gemm/gen/4x2c4-minmax-sse.c",
     "src/f32-gemm/gen/4x8-minmax-sse-dup.c",
     "src/f32-gemm/gen/4x8-minmax-sse-load1.c",
     "src/f32-gemm/gen/4x8s4-minmax-sse.c",
+    "src/f32-gemm/gen/5x8-minmax-sse-dup.c",
+    "src/f32-gemm/gen/5x8-minmax-sse-load1.c",
+    "src/f32-gemm/gen/5x8s4-minmax-sse.c",
     "src/f32-hswish/gen/hswish-sse-x4.c",
     "src/f32-hswish/gen/hswish-sse-x8.c",
     "src/f32-ibilinear/gen/sse-c4.c",
@@ -1320,10 +1371,16 @@
     "src/f32-igemm/gen/1x8-minmax-sse-dup.c",
     "src/f32-igemm/gen/1x8-minmax-sse-load1.c",
     "src/f32-igemm/gen/1x8s4-minmax-sse.c",
+    "src/f32-igemm/gen/3x8-minmax-sse-dup.c",
+    "src/f32-igemm/gen/3x8-minmax-sse-load1.c",
+    "src/f32-igemm/gen/3x8s4-minmax-sse.c",
     "src/f32-igemm/gen/4x2c4-minmax-sse.c",
     "src/f32-igemm/gen/4x8-minmax-sse-dup.c",
     "src/f32-igemm/gen/4x8-minmax-sse-load1.c",
     "src/f32-igemm/gen/4x8s4-minmax-sse.c",
+    "src/f32-igemm/gen/5x8-minmax-sse-dup.c",
+    "src/f32-igemm/gen/5x8-minmax-sse-load1.c",
+    "src/f32-igemm/gen/5x8s4-minmax-sse.c",
     "src/f32-maxpool/9p8x-minmax-sse-c4.c",
     "src/f32-pavgpool/9p8x-minmax-sse-c4.c",
     "src/f32-pavgpool/9x-minmax-sse-c4.c",
@@ -1397,6 +1454,18 @@
     "src/f32-argmaxpool/4x-sse2-c4.c",
     "src/f32-argmaxpool/9p8x-sse2-c4.c",
     "src/f32-argmaxpool/9x-sse2-c4.c",
+    "src/f32-gemm/gen-inc/1x8inc-minmax-sse2-dup.c",
+    "src/f32-gemm/gen-inc/3x8inc-minmax-sse2-dup.c",
+    "src/f32-gemm/gen-inc/4x8inc-minmax-sse2-dup.c",
+    "src/f32-gemm/gen-inc/5x8inc-minmax-sse2-dup.c",
+    "src/f32-gemm/gen/1x8-minmax-sse2-dup.c",
+    "src/f32-gemm/gen/3x8-minmax-sse2-dup.c",
+    "src/f32-gemm/gen/4x8-minmax-sse2-dup.c",
+    "src/f32-gemm/gen/5x8-minmax-sse2-dup.c",
+    "src/f32-igemm/gen/1x8-minmax-sse2-dup.c",
+    "src/f32-igemm/gen/3x8-minmax-sse2-dup.c",
+    "src/f32-igemm/gen/4x8-minmax-sse2-dup.c",
+    "src/f32-igemm/gen/5x8-minmax-sse2-dup.c",
     "src/f32-prelu/gen/sse2-2x4.c",
     "src/f32-prelu/gen/sse2-2x8.c",
     "src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c",
@@ -1423,6 +1492,18 @@
     "src/f32-sigmoid/gen/sse2-p5-div-x16.c",
     "src/f32-sigmoid/gen/sse2-p5-div-x20.c",
     "src/f32-sigmoid/gen/sse2-p5-div-x24.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x12.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x20.c",
+    "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x8.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x12.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x16.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x20.c",
+    "src/f32-velu/gen/velu-sse2-rr2-p6-x24.c",
     "src/f32-vlrelu/gen/vlrelu-sse2-x4.c",
     "src/f32-vlrelu/gen/vlrelu-sse2-x8.c",
     "src/f32-vrnd/gen/vrndd-sse2-x4.c",
@@ -1585,6 +1666,18 @@
     "src/f32-sigmoid/gen/sse41-p5-div-x16.c",
     "src/f32-sigmoid/gen/sse41-p5-div-x20.c",
     "src/f32-sigmoid/gen/sse41-p5-div-x24.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x4.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x12.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x20.c",
+    "src/f32-velu/gen/velu-sse41-rr2-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x4.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x8.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x12.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x16.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x20.c",
+    "src/f32-velu/gen/velu-sse41-rr2-p6-x24.c",
     "src/f32-vlrelu/gen/vlrelu-sse41-x4.c",
     "src/f32-vlrelu/gen/vlrelu-sse41-x8.c",
     "src/f32-vrnd/gen/vrndd-sse41-x4.c",
@@ -1757,6 +1850,24 @@
     "src/f32-vbinary/gen/vsub-minmax-avx-x16.c",
     "src/f32-vbinary/gen/vsubc-minmax-avx-x8.c",
     "src/f32-vbinary/gen/vsubc-minmax-avx-x16.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x8.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x16.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x24.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x32.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x40.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x48.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x8.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x16.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x24.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x32.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x40.c",
+    "src/f32-velu/gen/velu-avx-rr2-lut16-p3-x48.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x8.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x16.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x24.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x32.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x40.c",
+    "src/f32-velu/gen/velu-avx-rr2-p6-x48.c",
     "src/f32-vlrelu/gen/vlrelu-avx-x8.c",
     "src/f32-vlrelu/gen/vlrelu-avx-x16.c",
     "src/f32-vrnd/gen/vrndd-avx-x8.c",
@@ -1959,6 +2070,46 @@
     "src/f32-sigmoid/gen/avx2-rr1-p5-nr2fma-x64.c",
     "src/f32-sigmoid/gen/avx2-rr1-p5-nr2fma-x72.c",
     "src/f32-sigmoid/gen/avx2-rr1-p5-nr2fma-x80.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x8.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x16.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x24.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x32.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x40.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x48.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x56.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x64.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x72.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x80.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x8.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x16.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x24.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x32.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x40.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x48.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x56.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x64.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x72.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut8-p4-perm-x80.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x8.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x16.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x24.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x32.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x40.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x48.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x56.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x64.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x72.c",
+    "src/f32-velu/gen/velu-avx2-rr1-lut16-p3-gather-x80.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x8.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x16.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x24.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x32.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x40.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x48.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x56.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x64.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x72.c",
+    "src/f32-velu/gen/velu-avx2-rr1-p6-x80.c",
     "src/f32-vscaleexpminusmax/gen/avx2-p5-x8.c",
     "src/f32-vscaleexpminusmax/gen/avx2-p5-x16.c",
     "src/f32-vscaleexpminusmax/gen/avx2-p5-x24.c",
@@ -1986,11 +2137,11 @@
     "src/math/exp-avx2-rr2-lut8-p3-perm.c",
     "src/math/exp-avx2-rr2-lut8-p4-perm.c",
     "src/math/exp-avx2-rr2-p5.c",
-    "src/math/expminus-avx2-rr2-p5.c",
     "src/math/expm1minus-avx2-rr1-lut4-p4-perm.c",
     "src/math/expm1minus-avx2-rr1-lut8-p4-perm.c",
     "src/math/expm1minus-avx2-rr1-lut16-p3-gather.c",
     "src/math/expm1minus-avx2-rr1-p6.c",
+    "src/math/expminus-avx2-rr2-p5.c",
     "src/math/extexp-avx2-p5.c",
     "src/math/sigmoid-avx2-rr1-lut64-p2-gather-div.c",
     "src/math/sigmoid-avx2-rr1-lut64-p2-gather-nr1fma.c",
@@ -2187,6 +2338,22 @@
     "src/f32-vbinary/gen/vsub-minmax-avx512f-x32.c",
     "src/f32-vbinary/gen/vsubc-minmax-avx512f-x16.c",
     "src/f32-vbinary/gen/vsubc-minmax-avx512f-x32.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x16.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x32.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x48.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x64.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x80.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x96.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x112.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x128.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x16.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x32.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x48.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x64.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x80.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x96.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x112.c",
+    "src/f32-velu/gen/velu-avx512f-rr1-p6-x128.c",
     "src/f32-vlrelu/gen/vlrelu-avx512f-x16.c",
     "src/f32-vlrelu/gen/vlrelu-avx512f-x32.c",
     "src/f32-vrnd/gen/vrndd-avx512f-x16.c",
@@ -2534,31 +2701,6 @@
 }
 
 cc_library_static {
-    name: "xnnpack_psimd_accmath_ukernels",
-    defaults: ["xnnpack_internal_default"],
-    srcs: PSIMD_ACCMATH_UKERNELS,
-    cflags: [
-        "-O3",
-    ],
-    arch: {
-        arm: {
-            cflags: [
-                "-marm",
-                "-mfpu=neon",
-            ],
-        },
-    },
-    header_libs: [
-        "fp16_headers",
-        "psimd_headers",
-    ],
-    static_libs: [
-        "libpthreadpool",
-        "xnnpack_tables",
-    ],
-}
-
-cc_library_static {
     name: "xnnpack_neon_ukernels",
     defaults: ["xnnpack_internal_default"],
     arch: {
@@ -2977,7 +3119,6 @@
     arch: {
         arm: {
             whole_static_libs: [
-                "xnnpack_psimd_accmath_ukernels",
                 "xnnpack_neon_ukernels",
                 "xnnpack_neonfma_ukernels",
                 "xnnpack_neonv8_ukernels",
@@ -2987,7 +3128,6 @@
         },
         arm64: {
             whole_static_libs: [
-                "xnnpack_psimd_accmath_ukernels",
                 "xnnpack_neon_ukernels",
                 "xnnpack_neonfma_ukernels",
                 "xnnpack_neonv8_ukernels",
@@ -2998,7 +3138,6 @@
         },
         x86: {
             whole_static_libs: [
-                "xnnpack_psimd_accmath_ukernels",
                 "xnnpack_sse2_ukernels",
                 "xnnpack_ssse3_ukernels",
                 "xnnpack_sse41_ukernels",
@@ -3012,7 +3151,6 @@
         },
         x86_64: {
             whole_static_libs: [
-                "xnnpack_psimd_accmath_ukernels",
                 "xnnpack_sse2_ukernels",
                 "xnnpack_ssse3_ukernels",
                 "xnnpack_sse41_ukernels",