kyber/dilithium aarch64 pull from pqclean + patches (#1512)

* fix compiler warning/error in aes256_armv8.c

* pull pqclean+patches

* pull pqclean+patches

* remove old patches & update algorithm md/yml

* add new patches

* add patch with fixes for arm/kyber768+kyber1024

* update licenses in yamls and mds

* update kyber/dil suppression files

* removes superfluous pqclean Makefiles & updates copy_from_upstream script to handle this case

* update license information
diff --git a/README.md b/README.md
index bcd4e46..ef415c6 100644
--- a/README.md
+++ b/README.md
@@ -173,9 +173,9 @@
 - `src/kem/bike/additional`: Apache License v2.0
 - `src/kem/classic_mceliece/pqclean_*`: public domain
 - `src/kem/kyber/pqcrystals-*`: public domain (CC0) or Apache License v2.0
-- `src/kem/kyber/pqclean_*`: public domain
+- `src/kem/kyber/pqclean_*`: public domain (CC0), and public domain (CC0) or Apache License v2.0, and public domain (CC0) or MIT, and MIT
 - `src/sig/dilithium/pqcrystals-*`: public domain (CC0) or Apache License v2.0
-- `src/sig/dilithium/pqclean_*`: public domain
+- `src/sig/dilithium/pqclean_*`: public domain (CC0), and public domain (CC0) or Apache License v2.0, and public domain (CC0) or MIT, and MIT
 - `src/sig/sphincs/pqclean_*`: CC0 (public domain)
 
 ## Acknowledgements
diff --git a/docs/algorithms/kem/classic_mceliece.md b/docs/algorithms/kem/classic_mceliece.md
index 1d4df9c..0467853 100644
--- a/docs/algorithms/kem/classic_mceliece.md
+++ b/docs/algorithms/kem/classic_mceliece.md
@@ -6,7 +6,7 @@
 - **Authors' website**: https://classic.mceliece.org
 - **Specification version**: SUPERCOP-20221025.
 - **Primary Source**<a name="primary-source"></a>:
-  - **Source**: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07
+  - **Source**: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e
   - **Implementation license (SPDX-Identifier)**: Public domain
 - **Ancestors of primary source**:
   - SUPERCOP-20221025 "clean" and "avx2" implementations
diff --git a/docs/algorithms/kem/classic_mceliece.yml b/docs/algorithms/kem/classic_mceliece.yml
index 71565b4..5f2ca92 100644
--- a/docs/algorithms/kem/classic_mceliece.yml
+++ b/docs/algorithms/kem/classic_mceliece.yml
@@ -375,4 +375,4 @@
 auxiliary-submitters: []
 primary-upstream:
   spdx-license-identifier: Public domain
-  source: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07
+  source: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e
diff --git a/docs/algorithms/kem/hqc.md b/docs/algorithms/kem/hqc.md
index 2416c00..f022eaf 100644
--- a/docs/algorithms/kem/hqc.md
+++ b/docs/algorithms/kem/hqc.md
@@ -6,7 +6,7 @@
 - **Authors' website**: https://pqc-hqc.org/
 - **Specification version**: NIST Round 3 submission.
 - **Primary Source**<a name="primary-source"></a>:
-  - **Source**: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07
+  - **Source**: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e
   - **Implementation license (SPDX-Identifier)**: Public domain
 - **Ancestors of primary source**:
   - https://github.com/jschanck/package-pqclean/tree/29f79e72/hqc, which takes it from:
diff --git a/docs/algorithms/kem/hqc.yml b/docs/algorithms/kem/hqc.yml
index 8c090ad..227f246 100644
--- a/docs/algorithms/kem/hqc.yml
+++ b/docs/algorithms/kem/hqc.yml
@@ -125,4 +125,4 @@
     upstream: primary-upstream
 primary-upstream:
   spdx-license-identifier: Public domain
-  source: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07
+  source: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e
diff --git a/docs/algorithms/kem/kyber.md b/docs/algorithms/kem/kyber.md
index d03eb13..0e36507 100644
--- a/docs/algorithms/kem/kyber.md
+++ b/docs/algorithms/kem/kyber.md
@@ -11,8 +11,8 @@
   - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0
 - **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/518de2414a85052bb91349bcbcc347f391292d5b with copy_from_upstream patches
   - **pqclean-aarch64**:<a name="pqclean-aarch64"></a>
-      - **Source**: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07 with copy_from_upstream patches
-      - **Implementation license (SPDX-Identifier)**: CC0-1.0
+      - **Source**: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e with copy_from_upstream patches
+      - **Implementation license (SPDX-Identifier)**: CC0-1.0 and (CC0-1.0 or Apache-2.0) and (CC0-1.0 or MIT) and MIT
 
 
 ## Parameter set summary
diff --git a/docs/algorithms/kem/kyber.yml b/docs/algorithms/kem/kyber.yml
index bd8afff..36a938c 100644
--- a/docs/algorithms/kem/kyber.yml
+++ b/docs/algorithms/kem/kyber.yml
@@ -22,9 +22,9 @@
   spdx-license-identifier: CC0-1.0 or Apache-2.0
 optimized-upstreams:
   pqclean-aarch64:
-    source: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07
+    source: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e
       with copy_from_upstream patches
-    spdx-license-identifier: CC0-1.0
+    spdx-license-identifier: CC0-1.0 and (CC0-1.0 or Apache-2.0) and (CC0-1.0 or MIT) and MIT
 parameter-sets:
 - name: Kyber512
   claimed-nist-level: 1
diff --git a/docs/algorithms/sig/dilithium.md b/docs/algorithms/sig/dilithium.md
index ac6c1b7..7110a51 100644
--- a/docs/algorithms/sig/dilithium.md
+++ b/docs/algorithms/sig/dilithium.md
@@ -11,8 +11,8 @@
   - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0
 - **Optimized Implementation sources**: https://github.com/pq-crystals/dilithium/commit/3e9b9f1412f6c7435dbeb4e10692ea58f181ee51 with copy_from_upstream patches
   - **pqclean-aarch64**:<a name="pqclean-aarch64"></a>
-      - **Source**: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07 with copy_from_upstream patches
-      - **Implementation license (SPDX-Identifier)**: CC0-1.0
+      - **Source**: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e with copy_from_upstream patches
+      - **Implementation license (SPDX-Identifier)**: CC0-1.0 and (CC0-1.0 or Apache-2.0) and (CC0-1.0 or MIT) and MIT
 
 
 ## Parameter set summary
diff --git a/docs/algorithms/sig/dilithium.yml b/docs/algorithms/sig/dilithium.yml
index 65ea9a4..51282af 100644
--- a/docs/algorithms/sig/dilithium.yml
+++ b/docs/algorithms/sig/dilithium.yml
@@ -20,9 +20,9 @@
   spdx-license-identifier: CC0-1.0 or Apache-2.0
 optimized-upstreams:
   pqclean-aarch64:
-    source: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07
+    source: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e
       with copy_from_upstream patches
-    spdx-license-identifier: CC0-1.0
+    spdx-license-identifier: CC0-1.0 and (CC0-1.0 or Apache-2.0) and (CC0-1.0 or MIT) and MIT
 parameter-sets:
 - name: Dilithium2
   oqs_alg: OQS_SIG_alg_dilithium_2
diff --git a/docs/algorithms/sig/falcon.md b/docs/algorithms/sig/falcon.md
index c82d55a..50fbd10 100644
--- a/docs/algorithms/sig/falcon.md
+++ b/docs/algorithms/sig/falcon.md
@@ -7,7 +7,7 @@
 - **Authors' website**: https://falcon-sign.info
 - **Specification version**: 20211101.
 - **Primary Source**<a name="primary-source"></a>:
-  - **Source**: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07
+  - **Source**: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e
   - **Implementation license (SPDX-Identifier)**: MIT
 
 
diff --git a/docs/algorithms/sig/falcon.yml b/docs/algorithms/sig/falcon.yml
index f7b2fda..c578394 100644
--- a/docs/algorithms/sig/falcon.yml
+++ b/docs/algorithms/sig/falcon.yml
@@ -18,7 +18,7 @@
 nist-round: 3
 spec-version: 20211101
 primary-upstream:
-  source: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07
+  source: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e
   spdx-license-identifier: MIT
   upstream-ancestors:
   - https://www.falcon-sign.info
diff --git a/docs/algorithms/sig/sphincs.md b/docs/algorithms/sig/sphincs.md
index 60ba737..098ba03 100644
--- a/docs/algorithms/sig/sphincs.md
+++ b/docs/algorithms/sig/sphincs.md
@@ -7,7 +7,7 @@
 - **Authors' website**: https://sphincs.org/
 - **Specification version**: NIST Round 3 submission, v3.1 (June 10, 2022).
 - **Primary Source**<a name="primary-source"></a>:
-  - **Source**: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07 with copy_from_upstream patches
+  - **Source**: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e with copy_from_upstream patches
   - **Implementation license (SPDX-Identifier)**: CC0-1.0
 
 
diff --git a/docs/algorithms/sig/sphincs.yml b/docs/algorithms/sig/sphincs.yml
index 3cd78b3..90fc816 100644
--- a/docs/algorithms/sig/sphincs.yml
+++ b/docs/algorithms/sig/sphincs.yml
@@ -26,7 +26,7 @@
 spec-version: NIST Round 3 submission, v3.1 (June 10, 2022)
 spdx-license-identifier: CC0-1.0
 primary-upstream:
-  source: https://github.com/PQClean/PQClean/commit/d742438e5c541958bfd58070cd8668d757d88e07
+  source: https://github.com/PQClean/PQClean/commit/66e50172055aaf1b9a16d8f35fe03b0807f2723e
     with copy_from_upstream patches
   spdx-license-identifier: CC0-1.0
   upstream-ancestors:
diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py
index e82d094..99d04c6 100755
--- a/scripts/copy_from_upstream/copy_from_upstream.py
+++ b/scripts/copy_from_upstream/copy_from_upstream.py
@@ -462,11 +462,17 @@
 
 
     try:
+        ul = scheme['upstream_location']
+        if 'arch_specific_upstream_locations' in family and impl in family['arch_specific_upstream_locations']:
+            ul = family['arch_specific_upstream_locations'][impl]
+        elif 'arch_specific_upstream_locations' in scheme and impl in scheme['arch_specific_upstream_locations']:
+            ul = scheme['arch_specific_upstream_locations'][impl]
+        
         os.remove(os.path.join(dst_basedir, 'src', family['type'], family['name'],
-                               '{}_{}_{}'.format(scheme['upstream_location'], scheme['pqclean_scheme'], impl),
+                               '{}_{}_{}'.format(ul, scheme['pqclean_scheme'], impl),
                                'Makefile'))
         os.remove(os.path.join(dst_basedir, 'src', family['type'], family['name'],
-                               '{}_{}_{}'.format(scheme['upstream_location'], scheme['pqclean_scheme'], impl),
+                               '{}_{}_{}'.format(ul, scheme['pqclean_scheme'], impl),
                                'Makefile.Microsoft_nmake'))
     except FileNotFoundError:
         pass
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index 134c6fb..47cd76f 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -3,12 +3,12 @@
     name: pqclean
     git_url: https://github.com/PQClean/PQClean.git
     git_branch: master
-    git_commit: d742438e5c541958bfd58070cd8668d757d88e07
+    git_commit: 66e50172055aaf1b9a16d8f35fe03b0807f2723e
     kem_meta_path: 'crypto_kem/{pqclean_scheme}/META.yml'
     sig_meta_path: 'crypto_sign/{pqclean_scheme}/META.yml'
     kem_scheme_path: 'crypto_kem/{pqclean_scheme}'
     sig_scheme_path: 'crypto_sign/{pqclean_scheme}'
-    patches: [pqclean-sphincs.patch, pqclean-kyber-armneon-shake.patch, pqclean-kyber-arm-macos-gcc-fix.patch, pqclean-dilithium-arm-randomized-signing.patch, pqclean-classicmceliece.patch]
+    patches: [pqclean-sphincs.patch, pqclean-dilithium-arm-randomized-signing.patch, pqclean-dilithium-symbolnames.patch, pqclean-kyber-armneon-shake-fixes.patch, pqclean-kyber-armneon-768-1024-fixes.patch, pqclean-classicmceliece.patch]
     ignore: pqclean_sphincs-shake-256s-simple_aarch64, pqclean_sphincs-shake-256s-simple_aarch64, pqclean_sphincs-shake-256f-simple_aarch64, pqclean_sphincs-shake-192s-simple_aarch64, pqclean_sphincs-shake-192f-simple_aarch64, pqclean_sphincs-shake-128s-simple_aarch64, pqclean_sphincs-shake-128f-simple_aarch64
   -
     name: pqcrystals-kyber
diff --git a/scripts/copy_from_upstream/patches/pqclean-dilithium-symbolnames.patch b/scripts/copy_from_upstream/patches/pqclean-dilithium-symbolnames.patch
new file mode 100644
index 0000000..a3097ac
--- /dev/null
+++ b/scripts/copy_from_upstream/patches/pqclean-dilithium-symbolnames.patch
@@ -0,0 +1,1390 @@
+diff --git b/crypto_sign/dilithium2/aarch64/__asm_NTT.S a/crypto_sign/dilithium2/aarch64/__asm_NTT.S
+index b244cabc..946c3c3c 100644
+--- b/crypto_sign/dilithium2/aarch64/__asm_NTT.S
++++ a/crypto_sign/dilithium2/aarch64/__asm_NTT.S
+@@ -28,10 +28,10 @@
+ #include "macros.inc"
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_ntt_SIMD_top
+-.global _PQCLEAN_dilithium2_AARCH64__asm_ntt_SIMD_top
+-PQCLEAN_dilithium2_AARCH64__asm_ntt_SIMD_top:
+-_PQCLEAN_dilithium2_AARCH64__asm_ntt_SIMD_top:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top
++PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top:
+ 
+     push_all
+     Q         .req w20
+@@ -206,10 +206,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_ntt_SIMD_top:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_ntt_SIMD_bot
+-.global _PQCLEAN_dilithium2_AARCH64__asm_ntt_SIMD_bot
+-PQCLEAN_dilithium2_AARCH64__asm_ntt_SIMD_bot:
+-_PQCLEAN_dilithium2_AARCH64__asm_ntt_SIMD_bot:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot
++PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot:
+ 
+     push_all
+     Q         .req w20
+diff --git b/crypto_sign/dilithium2/aarch64/__asm_iNTT.S a/crypto_sign/dilithium2/aarch64/__asm_iNTT.S
+index f28bb313..56a5b7ae 100644
+--- b/crypto_sign/dilithium2/aarch64/__asm_iNTT.S
++++ a/crypto_sign/dilithium2/aarch64/__asm_iNTT.S
+@@ -28,10 +28,10 @@
+ #include "macros.inc"
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_intt_SIMD_top
+-.global _PQCLEAN_dilithium2_AARCH64__asm_intt_SIMD_top
+-PQCLEAN_dilithium2_AARCH64__asm_intt_SIMD_top:
+-_PQCLEAN_dilithium2_AARCH64__asm_intt_SIMD_top:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top
++PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top:
+ 
+     push_all
+     Q         .req w20
+@@ -435,10 +435,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_intt_SIMD_top:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_intt_SIMD_bot
+-.global _PQCLEAN_dilithium2_AARCH64__asm_intt_SIMD_bot
+-PQCLEAN_dilithium2_AARCH64__asm_intt_SIMD_bot:
+-_PQCLEAN_dilithium2_AARCH64__asm_intt_SIMD_bot:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot
++PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot:
+ 
+     push_all
+     Q         .req w20
+diff --git b/crypto_sign/dilithium2/aarch64/__asm_poly.S a/crypto_sign/dilithium2/aarch64/__asm_poly.S
+index 63295ca0..e7fe8388 100644
+--- b/crypto_sign/dilithium2/aarch64/__asm_poly.S
++++ a/crypto_sign/dilithium2/aarch64/__asm_poly.S
+@@ -29,10 +29,10 @@
+ #include "params.h"
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_10_to_32
+-.global _PQCLEAN_dilithium2_AARCH64__asm_10_to_32
+-PQCLEAN_dilithium2_AARCH64__asm_10_to_32:
+-_PQCLEAN_dilithium2_AARCH64__asm_10_to_32:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32
++PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32:
+ 
+     mov x7, #16
+     _10_to_32_loop:
+@@ -102,10 +102,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_10_to_32:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_poly_reduce
+-.global _PQCLEAN_dilithium2_AARCH64__asm_poly_reduce
+-PQCLEAN_dilithium2_AARCH64__asm_poly_reduce:
+-_PQCLEAN_dilithium2_AARCH64__asm_poly_reduce:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce
++PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce:
+ 
+     ldr w4, [x1]
+ 
+@@ -195,10 +195,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_poly_reduce:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_poly_caddq
+-.global _PQCLEAN_dilithium2_AARCH64__asm_poly_caddq
+-PQCLEAN_dilithium2_AARCH64__asm_poly_caddq:
+-_PQCLEAN_dilithium2_AARCH64__asm_poly_caddq:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq
++PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq:
+ 
+     ldr w4, [x1]
+ 
+@@ -288,10 +288,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_poly_caddq:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_poly_freeze
+-.global _PQCLEAN_dilithium2_AARCH64__asm_poly_freeze
+-PQCLEAN_dilithium2_AARCH64__asm_poly_freeze:
+-_PQCLEAN_dilithium2_AARCH64__asm_poly_freeze:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze
++PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze:
+ 
+     ldr w4, [x1]
+ 
+@@ -417,10 +417,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_poly_freeze:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_poly_power2round
+-.global _PQCLEAN_dilithium2_AARCH64__asm_poly_power2round
+-PQCLEAN_dilithium2_AARCH64__asm_poly_power2round:
+-_PQCLEAN_dilithium2_AARCH64__asm_poly_power2round:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round
++PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round:
+ 
+     mov w4, #1
+ 
+@@ -563,10 +563,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_poly_power2round:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_poly_add
+-.global _PQCLEAN_dilithium2_AARCH64__asm_poly_add
+-PQCLEAN_dilithium2_AARCH64__asm_poly_add:
+-_PQCLEAN_dilithium2_AARCH64__asm_poly_add:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add
++PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add:
+ 
+     ld1 {v0.4S}, [x1], #16
+     ld1 {v4.4S}, [x2], #16
+@@ -612,10 +612,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_poly_add:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_poly_sub
+-.global _PQCLEAN_dilithium2_AARCH64__asm_poly_sub
+-PQCLEAN_dilithium2_AARCH64__asm_poly_sub:
+-_PQCLEAN_dilithium2_AARCH64__asm_poly_sub:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub
++PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub:
+ 
+     ld1 {v0.4S}, [x1], #16
+     ld1 {v4.4S}, [x2], #16
+@@ -661,10 +661,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_poly_sub:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_poly_shiftl
+-.global _PQCLEAN_dilithium2_AARCH64__asm_poly_shiftl
+-PQCLEAN_dilithium2_AARCH64__asm_poly_shiftl:
+-_PQCLEAN_dilithium2_AARCH64__asm_poly_shiftl:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl
++PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl:
+ 
+     add x1, x0, #0
+ 
+@@ -728,10 +728,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_poly_shiftl:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_poly_pointwise_montgomery
+-.global _PQCLEAN_dilithium2_AARCH64__asm_poly_pointwise_montgomery
+-PQCLEAN_dilithium2_AARCH64__asm_poly_pointwise_montgomery:
+-_PQCLEAN_dilithium2_AARCH64__asm_poly_pointwise_montgomery:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery
++PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery:
+ 
+     push_all
+ 
+@@ -847,10 +847,10 @@ _PQCLEAN_dilithium2_AARCH64__asm_poly_pointwise_montgomery:
+ 
+ 
+ .align 2
+-.global PQCLEAN_dilithium2_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+-.global _PQCLEAN_dilithium2_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+-PQCLEAN_dilithium2_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
+-_PQCLEAN_dilithium2_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
++.global PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery
++.global _PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery
++PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
++_PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
+ 
+     push_all
+ 
+diff --git b/crypto_sign/dilithium2/aarch64/api.h a/crypto_sign/dilithium2/aarch64/api.h
+index 3f7b3cb0..2ce42599 100644
+--- b/crypto_sign/dilithium2/aarch64/api.h
++++ a/crypto_sign/dilithium2/aarch64/api.h
+@@ -5,33 +5,33 @@
+  * or public domain at https://github.com/pq-crystals/dilithium
+  */
+ 
+-#ifndef PQCLEAN_dilithium2_AARCH64_API_H
+-#define PQCLEAN_dilithium2_AARCH64_API_H
++#ifndef PQCLEAN_DILITHIUM2_AARCH64_API_H
++#define PQCLEAN_DILITHIUM2_AARCH64_API_H
+ 
+ 
+ #include <stddef.h>
+ #include <stdint.h>
+ 
+-#define PQCLEAN_dilithium2_AARCH64_CRYPTO_PUBLICKEYBYTES 1312
+-#define PQCLEAN_dilithium2_AARCH64_CRYPTO_SECRETKEYBYTES 2528
+-#define PQCLEAN_dilithium2_AARCH64_CRYPTO_BYTES 2420
+-#define PQCLEAN_dilithium2_AARCH64_CRYPTO_ALGNAME "Dilithium2"
++#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES 1312
++#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES 2528
++#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES 2420
++#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_ALGNAME "Dilithium2"
+ 
+-int PQCLEAN_dilithium2_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
++int PQCLEAN_DILITHIUM2_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+ 
+-int PQCLEAN_dilithium2_AARCH64_crypto_sign_signature(
++int PQCLEAN_DILITHIUM2_AARCH64_crypto_sign_signature(
+     uint8_t *sig, size_t *siglen,
+     const uint8_t *m, size_t mlen, const uint8_t *sk);
+ 
+-int PQCLEAN_dilithium2_AARCH64_crypto_sign_verify(
++int PQCLEAN_DILITHIUM2_AARCH64_crypto_sign_verify(
+     const uint8_t *sig, size_t siglen,
+     const uint8_t *m, size_t mlen, const uint8_t *pk);
+ 
+-int PQCLEAN_dilithium2_AARCH64_crypto_sign(
++int PQCLEAN_DILITHIUM2_AARCH64_crypto_sign(
+     uint8_t *sm, size_t *smlen,
+     const uint8_t *m, size_t mlen, const uint8_t *sk);
+ 
+-int PQCLEAN_dilithium2_AARCH64_crypto_sign_open(
++int PQCLEAN_DILITHIUM2_AARCH64_crypto_sign_open(
+     uint8_t *m, size_t *mlen,
+     const uint8_t *sm, size_t smlen, const uint8_t *pk);
+ 
+diff --git b/crypto_sign/dilithium2/aarch64/feat.S a/crypto_sign/dilithium2/aarch64/feat.S
+index a7f9a2e2..63be5df6 100644
+--- b/crypto_sign/dilithium2/aarch64/feat.S
++++ a/crypto_sign/dilithium2/aarch64/feat.S
+@@ -123,10 +123,10 @@ SOFTWARE.
+ .endm
+ 
+ .align 4
+-.global PQCLEAN_dilithium2_AARCH64_f1600x2
+-.global _PQCLEAN_dilithium2_AARCH64_f1600x2
+-PQCLEAN_dilithium2_AARCH64_f1600x2:
+-_PQCLEAN_dilithium2_AARCH64_f1600x2:
++.global PQCLEAN_DILITHIUM2_AARCH64_f1600x2
++.global _PQCLEAN_DILITHIUM2_AARCH64_f1600x2
++PQCLEAN_DILITHIUM2_AARCH64_f1600x2:
++_PQCLEAN_DILITHIUM2_AARCH64_f1600x2:
+     stp d8,  d9,  [sp,#-16]!
+     stp d10, d11, [sp,#-16]!
+     stp d12, d13, [sp,#-16]!
+diff --git b/crypto_sign/dilithium2/aarch64/fips202x2.c a/crypto_sign/dilithium2/aarch64/fips202x2.c
+index 446a91cc..f2faa493 100644
+--- b/crypto_sign/dilithium2/aarch64/fips202x2.c
++++ a/crypto_sign/dilithium2/aarch64/fips202x2.c
+@@ -101,12 +101,12 @@ static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = {
+ *
+ * Arguments:   - uint64_t *state: pointer to input/output Keccak state
+ **************************************************/
+-extern void PQCLEAN_dilithium2_AARCH64_f1600x2(v128*, const uint64_t*);
++extern void PQCLEAN_DILITHIUM2_AARCH64_f1600x2(v128*, const uint64_t*);
+ static inline
+ void KeccakF1600_StatePermutex2(v128 state[25])
+ {
+ #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */
+-  PQCLEAN_dilithium2_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
++  PQCLEAN_DILITHIUM2_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
+ #else
+   v128 Aba, Abe, Abi, Abo, Abu;
+   v128 Aga, Age, Agi, Ago, Agu;
+diff --git b/crypto_sign/dilithium2/aarch64/ntt.h a/crypto_sign/dilithium2/aarch64/ntt.h
+index 60a07094..5543e95d 100644
+--- b/crypto_sign/dilithium2/aarch64/ntt.h
++++ a/crypto_sign/dilithium2/aarch64/ntt.h
+@@ -36,20 +36,20 @@
+ #include "params.h"
+ #include <stdint.h>
+ 
+-extern void PQCLEAN_dilithium2_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
+-extern void PQCLEAN_dilithium2_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
+ 
+-extern void PQCLEAN_dilithium2_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
+-extern void PQCLEAN_dilithium2_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
+ 
+ #define NTT(in) { \
+-        PQCLEAN_dilithium2_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+-        PQCLEAN_dilithium2_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+     }
+ 
+ #define iNTT(in) { \
+-        PQCLEAN_dilithium2_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
+-        PQCLEAN_dilithium2_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
+     }
+ 
+ #define ntt DILITHIUM_NAMESPACE(ntt)
+diff --git b/crypto_sign/dilithium2/aarch64/params.h a/crypto_sign/dilithium2/aarch64/params.h
+index c425b400..2f121ab4 100644
+--- b/crypto_sign/dilithium2/aarch64/params.h
++++ a/crypto_sign/dilithium2/aarch64/params.h
+@@ -12,8 +12,8 @@
+ //#define DILITHIUM_MODE 3
+ //#define DILITHIUM_MODE 5
+ 
+-#define CRYPTO_NAMESPACETOP PQCLEAN_dilithium2_AARCH64_crypto_sign
+-#define CRYPTO_NAMESPACE(s) PQCLEAN_dilithium2_AARCH64_##s
++#define CRYPTO_NAMESPACETOP PQCLEAN_DILITHIUM2_AARCH64_crypto_sign
++#define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM2_AARCH64_##s
+ #define DILITHIUM_NAMESPACETOP CRYPTO_NAMESPACETOP
+ #define DILITHIUM_NAMESPACE(s) CRYPTO_NAMESPACE(s)
+ 
+diff --git b/crypto_sign/dilithium2/aarch64/poly.c a/crypto_sign/dilithium2/aarch64/poly.c
+index 838dfe63..f6f303a4 100644
+--- b/crypto_sign/dilithium2/aarch64/poly.c
++++ a/crypto_sign/dilithium2/aarch64/poly.c
+@@ -57,11 +57,11 @@ static const int32_t montgomery_const[4] = {
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium2_AARCH64_asm_poly_reduce(int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce(int32_t *, const int32_t *);
+ void poly_reduce(poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium2_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce(a->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tred);
+ }
+@@ -74,11 +74,11 @@ void poly_reduce(poly *a) {
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium2_AARCH64_asm_poly_caddq(int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq(int32_t *, const int32_t *);
+ void poly_caddq(poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium2_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq(a->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tred);
+ }
+@@ -91,11 +91,11 @@ void poly_caddq(poly *a) {
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium2_AARCH64_asm_poly_freeze(int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze(int32_t *, const int32_t *);
+ void poly_freeze(poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium2_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze(a->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tred);
+ }
+@@ -205,11 +205,11 @@ void poly_invntt_tomont(poly *a) {
+ *              - const poly *a: pointer to first input polynomial
+ *              - const poly *b: pointer to second input polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium2_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
+ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium2_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tmul);
+ }
+@@ -226,11 +226,11 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
+ *              - poly *a0: pointer to output polynomial with coefficients c0
+ *              - const poly *a: pointer to input polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium2_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
+ void poly_power2round(poly *a1, poly *a0, const poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium2_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
++    PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
+ 
+     DBENCH_STOP(*tround);
+ }
+@@ -738,11 +738,11 @@ void polyt1_pack(uint8_t *r, const poly *a) {
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium2_AARCH64_asm_10_to_32(int32_t *, const uint8_t *);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32(int32_t *, const uint8_t *);
+ void polyt1_unpack(poly *r, const uint8_t *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium2_AARCH64_asm_10_to_32(r->coeffs, a);
++    PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32(r->coeffs, a);
+ 
+     DBENCH_STOP(*tpack);
+ }
+diff --git b/crypto_sign/dilithium2/aarch64/polyvec.c a/crypto_sign/dilithium2/aarch64/polyvec.c
+index 12c9f2df..83fb05ef 100644
+--- b/crypto_sign/dilithium2/aarch64/polyvec.c
++++ a/crypto_sign/dilithium2/aarch64/polyvec.c
+@@ -178,11 +178,11 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyve
+ *              - const polyvecl *u: pointer to first input vector
+ *              - const polyvecl *v: pointer to second input vector
+ **************************************************/
+-extern void PQCLEAN_dilithium2_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
+ void polyvecl_pointwise_acc_montgomery(poly *w,
+                                        const polyvecl *u,
+                                        const polyvecl *v) {
+-    PQCLEAN_dilithium2_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
++    PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
+ }
+ 
+ /*************************************************
+diff --git b/crypto_sign/dilithium3/aarch64/__asm_NTT.S a/crypto_sign/dilithium3/aarch64/__asm_NTT.S
+index 8704d272..96d96515 100644
+--- b/crypto_sign/dilithium3/aarch64/__asm_NTT.S
++++ a/crypto_sign/dilithium3/aarch64/__asm_NTT.S
+@@ -28,10 +28,10 @@
+ #include "macros.inc"
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_ntt_SIMD_top
+-.global _PQCLEAN_dilithium3_AARCH64__asm_ntt_SIMD_top
+-PQCLEAN_dilithium3_AARCH64__asm_ntt_SIMD_top:
+-_PQCLEAN_dilithium3_AARCH64__asm_ntt_SIMD_top:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top
++PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top:
+ 
+     push_all
+     Q         .req w20
+@@ -206,10 +206,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_ntt_SIMD_top:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_ntt_SIMD_bot
+-.global _PQCLEAN_dilithium3_AARCH64__asm_ntt_SIMD_bot
+-PQCLEAN_dilithium3_AARCH64__asm_ntt_SIMD_bot:
+-_PQCLEAN_dilithium3_AARCH64__asm_ntt_SIMD_bot:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot
++PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot:
+ 
+     push_all
+     Q         .req w20
+diff --git b/crypto_sign/dilithium3/aarch64/__asm_iNTT.S a/crypto_sign/dilithium3/aarch64/__asm_iNTT.S
+index 32b91567..119f7521 100644
+--- b/crypto_sign/dilithium3/aarch64/__asm_iNTT.S
++++ a/crypto_sign/dilithium3/aarch64/__asm_iNTT.S
+@@ -28,10 +28,10 @@
+ #include "macros.inc"
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_intt_SIMD_top
+-.global _PQCLEAN_dilithium3_AARCH64__asm_intt_SIMD_top
+-PQCLEAN_dilithium3_AARCH64__asm_intt_SIMD_top:
+-_PQCLEAN_dilithium3_AARCH64__asm_intt_SIMD_top:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top
++PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top:
+ 
+     push_all
+     Q         .req w20
+@@ -435,10 +435,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_intt_SIMD_top:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_intt_SIMD_bot
+-.global _PQCLEAN_dilithium3_AARCH64__asm_intt_SIMD_bot
+-PQCLEAN_dilithium3_AARCH64__asm_intt_SIMD_bot:
+-_PQCLEAN_dilithium3_AARCH64__asm_intt_SIMD_bot:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot
++PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot:
+ 
+     push_all
+     Q         .req w20
+diff --git b/crypto_sign/dilithium3/aarch64/__asm_poly.S a/crypto_sign/dilithium3/aarch64/__asm_poly.S
+index 0336064c..ed888fd1 100644
+--- b/crypto_sign/dilithium3/aarch64/__asm_poly.S
++++ a/crypto_sign/dilithium3/aarch64/__asm_poly.S
+@@ -29,10 +29,10 @@
+ #include "params.h"
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_10_to_32
+-.global _PQCLEAN_dilithium3_AARCH64__asm_10_to_32
+-PQCLEAN_dilithium3_AARCH64__asm_10_to_32:
+-_PQCLEAN_dilithium3_AARCH64__asm_10_to_32:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32
++PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32:
+ 
+     mov x7, #16
+     _10_to_32_loop:
+@@ -102,10 +102,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_10_to_32:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_poly_reduce
+-.global _PQCLEAN_dilithium3_AARCH64__asm_poly_reduce
+-PQCLEAN_dilithium3_AARCH64__asm_poly_reduce:
+-_PQCLEAN_dilithium3_AARCH64__asm_poly_reduce:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce
++PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce:
+ 
+     ldr w4, [x1]
+ 
+@@ -195,10 +195,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_poly_reduce:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_poly_caddq
+-.global _PQCLEAN_dilithium3_AARCH64__asm_poly_caddq
+-PQCLEAN_dilithium3_AARCH64__asm_poly_caddq:
+-_PQCLEAN_dilithium3_AARCH64__asm_poly_caddq:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq
++PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq:
+ 
+     ldr w4, [x1]
+ 
+@@ -288,10 +288,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_poly_caddq:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_poly_freeze
+-.global _PQCLEAN_dilithium3_AARCH64__asm_poly_freeze
+-PQCLEAN_dilithium3_AARCH64__asm_poly_freeze:
+-_PQCLEAN_dilithium3_AARCH64__asm_poly_freeze:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze
++PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze:
+ 
+     ldr w4, [x1]
+ 
+@@ -417,10 +417,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_poly_freeze:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_poly_power2round
+-.global _PQCLEAN_dilithium3_AARCH64__asm_poly_power2round
+-PQCLEAN_dilithium3_AARCH64__asm_poly_power2round:
+-_PQCLEAN_dilithium3_AARCH64__asm_poly_power2round:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round
++PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round:
+ 
+     mov w4, #1
+ 
+@@ -563,10 +563,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_poly_power2round:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_poly_add
+-.global _PQCLEAN_dilithium3_AARCH64__asm_poly_add
+-PQCLEAN_dilithium3_AARCH64__asm_poly_add:
+-_PQCLEAN_dilithium3_AARCH64__asm_poly_add:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add
++PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add:
+ 
+     ld1 {v0.4S}, [x1], #16
+     ld1 {v4.4S}, [x2], #16
+@@ -612,10 +612,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_poly_add:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_poly_sub
+-.global _PQCLEAN_dilithium3_AARCH64__asm_poly_sub
+-PQCLEAN_dilithium3_AARCH64__asm_poly_sub:
+-_PQCLEAN_dilithium3_AARCH64__asm_poly_sub:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub
++PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub:
+ 
+     ld1 {v0.4S}, [x1], #16
+     ld1 {v4.4S}, [x2], #16
+@@ -661,10 +661,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_poly_sub:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_poly_shiftl
+-.global _PQCLEAN_dilithium3_AARCH64__asm_poly_shiftl
+-PQCLEAN_dilithium3_AARCH64__asm_poly_shiftl:
+-_PQCLEAN_dilithium3_AARCH64__asm_poly_shiftl:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl
++PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl:
+ 
+     add x1, x0, #0
+ 
+@@ -728,10 +728,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_poly_shiftl:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_poly_pointwise_montgomery
+-.global _PQCLEAN_dilithium3_AARCH64__asm_poly_pointwise_montgomery
+-PQCLEAN_dilithium3_AARCH64__asm_poly_pointwise_montgomery:
+-_PQCLEAN_dilithium3_AARCH64__asm_poly_pointwise_montgomery:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery
++PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery:
+ 
+     push_all
+ 
+@@ -847,10 +847,10 @@ _PQCLEAN_dilithium3_AARCH64__asm_poly_pointwise_montgomery:
+ 
+ 
+ .align 2
+-.global PQCLEAN_dilithium3_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+-.global _PQCLEAN_dilithium3_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+-PQCLEAN_dilithium3_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
+-_PQCLEAN_dilithium3_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
++.global PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery
++.global _PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery
++PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
++_PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
+ 
+     push_all
+ 
+diff --git b/crypto_sign/dilithium3/aarch64/api.h a/crypto_sign/dilithium3/aarch64/api.h
+index 13d9d8cf..6a056440 100644
+--- b/crypto_sign/dilithium3/aarch64/api.h
++++ a/crypto_sign/dilithium3/aarch64/api.h
+@@ -5,33 +5,33 @@
+  * or public domain at https://github.com/pq-crystals/dilithium
+  */
+ 
+-#ifndef PQCLEAN_dilithium3_AARCH64_API_H
+-#define PQCLEAN_dilithium3_AARCH64_API_H
++#ifndef PQCLEAN_DILITHIUM3_AARCH64_API_H
++#define PQCLEAN_DILITHIUM3_AARCH64_API_H
+ 
+ 
+ #include <stddef.h>
+ #include <stdint.h>
+ 
+-#define PQCLEAN_dilithium3_AARCH64_CRYPTO_PUBLICKEYBYTES 1952
+-#define PQCLEAN_dilithium3_AARCH64_CRYPTO_SECRETKEYBYTES 4000
+-#define PQCLEAN_dilithium3_AARCH64_CRYPTO_BYTES 3293
+-#define PQCLEAN_dilithium3_AARCH64_CRYPTO_ALGNAME "Dilithium3"
++#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES 1952
++#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES 4000
++#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES 3293
++#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_ALGNAME "Dilithium3"
+ 
+-int PQCLEAN_dilithium3_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
++int PQCLEAN_DILITHIUM3_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+ 
+-int PQCLEAN_dilithium3_AARCH64_crypto_sign_signature(
++int PQCLEAN_DILITHIUM3_AARCH64_crypto_sign_signature(
+     uint8_t *sig, size_t *siglen,
+     const uint8_t *m, size_t mlen, const uint8_t *sk);
+ 
+-int PQCLEAN_dilithium3_AARCH64_crypto_sign_verify(
++int PQCLEAN_DILITHIUM3_AARCH64_crypto_sign_verify(
+     const uint8_t *sig, size_t siglen,
+     const uint8_t *m, size_t mlen, const uint8_t *pk);
+ 
+-int PQCLEAN_dilithium3_AARCH64_crypto_sign(
++int PQCLEAN_DILITHIUM3_AARCH64_crypto_sign(
+     uint8_t *sm, size_t *smlen,
+     const uint8_t *m, size_t mlen, const uint8_t *sk);
+ 
+-int PQCLEAN_dilithium3_AARCH64_crypto_sign_open(
++int PQCLEAN_DILITHIUM3_AARCH64_crypto_sign_open(
+     uint8_t *m, size_t *mlen,
+     const uint8_t *sm, size_t smlen, const uint8_t *pk);
+ 
+diff --git b/crypto_sign/dilithium3/aarch64/feat.S a/crypto_sign/dilithium3/aarch64/feat.S
+index c0ccd712..358adf61 100644
+--- b/crypto_sign/dilithium3/aarch64/feat.S
++++ a/crypto_sign/dilithium3/aarch64/feat.S
+@@ -123,10 +123,10 @@ SOFTWARE.
+ .endm
+ 
+ .align 4
+-.global PQCLEAN_dilithium3_AARCH64_f1600x2
+-.global _PQCLEAN_dilithium3_AARCH64_f1600x2
+-PQCLEAN_dilithium3_AARCH64_f1600x2:
+-_PQCLEAN_dilithium3_AARCH64_f1600x2:
++.global PQCLEAN_DILITHIUM3_AARCH64_f1600x2
++.global _PQCLEAN_DILITHIUM3_AARCH64_f1600x2
++PQCLEAN_DILITHIUM3_AARCH64_f1600x2:
++_PQCLEAN_DILITHIUM3_AARCH64_f1600x2:
+     stp d8,  d9,  [sp,#-16]!
+     stp d10, d11, [sp,#-16]!
+     stp d12, d13, [sp,#-16]!
+diff --git b/crypto_sign/dilithium3/aarch64/fips202x2.c a/crypto_sign/dilithium3/aarch64/fips202x2.c
+index 0648aa82..e36a6788 100644
+--- b/crypto_sign/dilithium3/aarch64/fips202x2.c
++++ a/crypto_sign/dilithium3/aarch64/fips202x2.c
+@@ -101,12 +101,12 @@ static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = {
+ *
+ * Arguments:   - uint64_t *state: pointer to input/output Keccak state
+ **************************************************/
+-extern void PQCLEAN_dilithium3_AARCH64_f1600x2(v128*, const uint64_t*);
++extern void PQCLEAN_DILITHIUM3_AARCH64_f1600x2(v128*, const uint64_t*);
+ static inline
+ void KeccakF1600_StatePermutex2(v128 state[25])
+ {
+ #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */
+-  PQCLEAN_dilithium3_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
++  PQCLEAN_DILITHIUM3_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
+ #else
+   v128 Aba, Abe, Abi, Abo, Abu;
+   v128 Aga, Age, Agi, Ago, Agu;
+diff --git b/crypto_sign/dilithium3/aarch64/ntt.h a/crypto_sign/dilithium3/aarch64/ntt.h
+index 3175fd96..2f16fac0 100644
+--- b/crypto_sign/dilithium3/aarch64/ntt.h
++++ a/crypto_sign/dilithium3/aarch64/ntt.h
+@@ -36,20 +36,20 @@
+ #include "params.h"
+ #include <stdint.h>
+ 
+-extern void PQCLEAN_dilithium3_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
+-extern void PQCLEAN_dilithium3_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
+ 
+-extern void PQCLEAN_dilithium3_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
+-extern void PQCLEAN_dilithium3_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
+ 
+ #define NTT(in) { \
+-        PQCLEAN_dilithium3_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+-        PQCLEAN_dilithium3_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+     }
+ 
+ #define iNTT(in) { \
+-        PQCLEAN_dilithium3_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
+-        PQCLEAN_dilithium3_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
+     }
+ 
+ #define ntt DILITHIUM_NAMESPACE(ntt)
+diff --git b/crypto_sign/dilithium3/aarch64/params.h a/crypto_sign/dilithium3/aarch64/params.h
+index 9d722c2a..0ee61152 100644
+--- b/crypto_sign/dilithium3/aarch64/params.h
++++ a/crypto_sign/dilithium3/aarch64/params.h
+@@ -12,8 +12,8 @@
+ #define DILITHIUM_MODE 3
+ //#define DILITHIUM_MODE 5
+ 
+-#define CRYPTO_NAMESPACETOP PQCLEAN_dilithium3_AARCH64_crypto_sign
+-#define CRYPTO_NAMESPACE(s) PQCLEAN_dilithium3_AARCH64_##s
++#define CRYPTO_NAMESPACETOP PQCLEAN_DILITHIUM3_AARCH64_crypto_sign
++#define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM3_AARCH64_##s
+ #define DILITHIUM_NAMESPACETOP CRYPTO_NAMESPACETOP
+ #define DILITHIUM_NAMESPACE(s) CRYPTO_NAMESPACE(s)
+ 
+diff --git b/crypto_sign/dilithium3/aarch64/poly.c a/crypto_sign/dilithium3/aarch64/poly.c
+index 5a7fd240..f13f981e 100644
+--- b/crypto_sign/dilithium3/aarch64/poly.c
++++ a/crypto_sign/dilithium3/aarch64/poly.c
+@@ -57,11 +57,11 @@ static const int32_t montgomery_const[4] = {
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium3_AARCH64_asm_poly_reduce(int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce(int32_t *, const int32_t *);
+ void poly_reduce(poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium3_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce(a->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tred);
+ }
+@@ -74,11 +74,11 @@ void poly_reduce(poly *a) {
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium3_AARCH64_asm_poly_caddq(int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq(int32_t *, const int32_t *);
+ void poly_caddq(poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium3_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq(a->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tred);
+ }
+@@ -91,11 +91,11 @@ void poly_caddq(poly *a) {
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium3_AARCH64_asm_poly_freeze(int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze(int32_t *, const int32_t *);
+ void poly_freeze(poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium3_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze(a->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tred);
+ }
+@@ -205,11 +205,11 @@ void poly_invntt_tomont(poly *a) {
+ *              - const poly *a: pointer to first input polynomial
+ *              - const poly *b: pointer to second input polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium3_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
+ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium3_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tmul);
+ }
+@@ -226,11 +226,11 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
+ *              - poly *a0: pointer to output polynomial with coefficients c0
+ *              - const poly *a: pointer to input polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium3_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
+ void poly_power2round(poly *a1, poly *a0, const poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium3_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
++    PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
+ 
+     DBENCH_STOP(*tround);
+ }
+@@ -714,11 +714,11 @@ void polyt1_pack(uint8_t *r, const poly *a) {
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium3_AARCH64_asm_10_to_32(int32_t *, const uint8_t *);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32(int32_t *, const uint8_t *);
+ void polyt1_unpack(poly *r, const uint8_t *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium3_AARCH64_asm_10_to_32(r->coeffs, a);
++    PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32(r->coeffs, a);
+ 
+     DBENCH_STOP(*tpack);
+ }
+diff --git b/crypto_sign/dilithium3/aarch64/polyvec.c a/crypto_sign/dilithium3/aarch64/polyvec.c
+index 3a5e6acb..b73d210c 100644
+--- b/crypto_sign/dilithium3/aarch64/polyvec.c
++++ a/crypto_sign/dilithium3/aarch64/polyvec.c
+@@ -178,11 +178,11 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyve
+ *              - const polyvecl *u: pointer to first input vector
+ *              - const polyvecl *v: pointer to second input vector
+ **************************************************/
+-extern void PQCLEAN_dilithium3_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
+ void polyvecl_pointwise_acc_montgomery(poly *w,
+                                        const polyvecl *u,
+                                        const polyvecl *v) {
+-    PQCLEAN_dilithium3_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
++    PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
+ }
+ 
+ /*************************************************
+diff --git b/crypto_sign/dilithium5/aarch64/__asm_NTT.S a/crypto_sign/dilithium5/aarch64/__asm_NTT.S
+index a98ae018..9cf61432 100644
+--- b/crypto_sign/dilithium5/aarch64/__asm_NTT.S
++++ a/crypto_sign/dilithium5/aarch64/__asm_NTT.S
+@@ -28,10 +28,10 @@
+ #include "macros.inc"
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_ntt_SIMD_top
+-.global _PQCLEAN_dilithium5_AARCH64__asm_ntt_SIMD_top
+-PQCLEAN_dilithium5_AARCH64__asm_ntt_SIMD_top:
+-_PQCLEAN_dilithium5_AARCH64__asm_ntt_SIMD_top:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top
++PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top:
+ 
+     push_all
+     Q         .req w20
+@@ -206,10 +206,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_ntt_SIMD_top:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_ntt_SIMD_bot
+-.global _PQCLEAN_dilithium5_AARCH64__asm_ntt_SIMD_bot
+-PQCLEAN_dilithium5_AARCH64__asm_ntt_SIMD_bot:
+-_PQCLEAN_dilithium5_AARCH64__asm_ntt_SIMD_bot:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot
++PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot:
+ 
+     push_all
+     Q         .req w20
+diff --git b/crypto_sign/dilithium5/aarch64/__asm_iNTT.S a/crypto_sign/dilithium5/aarch64/__asm_iNTT.S
+index 68179e9d..9daebaf6 100644
+--- b/crypto_sign/dilithium5/aarch64/__asm_iNTT.S
++++ a/crypto_sign/dilithium5/aarch64/__asm_iNTT.S
+@@ -28,10 +28,10 @@
+ #include "macros.inc"
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_intt_SIMD_top
+-.global _PQCLEAN_dilithium5_AARCH64__asm_intt_SIMD_top
+-PQCLEAN_dilithium5_AARCH64__asm_intt_SIMD_top:
+-_PQCLEAN_dilithium5_AARCH64__asm_intt_SIMD_top:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top
++PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top:
+ 
+     push_all
+     Q         .req w20
+@@ -435,10 +435,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_intt_SIMD_top:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_intt_SIMD_bot
+-.global _PQCLEAN_dilithium5_AARCH64__asm_intt_SIMD_bot
+-PQCLEAN_dilithium5_AARCH64__asm_intt_SIMD_bot:
+-_PQCLEAN_dilithium5_AARCH64__asm_intt_SIMD_bot:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot
++PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot:
+ 
+     push_all
+     Q         .req w20
+diff --git b/crypto_sign/dilithium5/aarch64/__asm_poly.S a/crypto_sign/dilithium5/aarch64/__asm_poly.S
+index 79e01e9d..edcc82b2 100644
+--- b/crypto_sign/dilithium5/aarch64/__asm_poly.S
++++ a/crypto_sign/dilithium5/aarch64/__asm_poly.S
+@@ -29,10 +29,10 @@
+ #include "params.h"
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_10_to_32
+-.global _PQCLEAN_dilithium5_AARCH64__asm_10_to_32
+-PQCLEAN_dilithium5_AARCH64__asm_10_to_32:
+-_PQCLEAN_dilithium5_AARCH64__asm_10_to_32:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32
++PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32:
+ 
+     mov x7, #16
+     _10_to_32_loop:
+@@ -102,10 +102,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_10_to_32:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_poly_reduce
+-.global _PQCLEAN_dilithium5_AARCH64__asm_poly_reduce
+-PQCLEAN_dilithium5_AARCH64__asm_poly_reduce:
+-_PQCLEAN_dilithium5_AARCH64__asm_poly_reduce:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce
++PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce:
+ 
+     ldr w4, [x1]
+ 
+@@ -195,10 +195,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_poly_reduce:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_poly_caddq
+-.global _PQCLEAN_dilithium5_AARCH64__asm_poly_caddq
+-PQCLEAN_dilithium5_AARCH64__asm_poly_caddq:
+-_PQCLEAN_dilithium5_AARCH64__asm_poly_caddq:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq
++PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq:
+ 
+     ldr w4, [x1]
+ 
+@@ -288,10 +288,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_poly_caddq:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_poly_freeze
+-.global _PQCLEAN_dilithium5_AARCH64__asm_poly_freeze
+-PQCLEAN_dilithium5_AARCH64__asm_poly_freeze:
+-_PQCLEAN_dilithium5_AARCH64__asm_poly_freeze:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze
++PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze:
+ 
+     ldr w4, [x1]
+ 
+@@ -417,10 +417,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_poly_freeze:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_poly_power2round
+-.global _PQCLEAN_dilithium5_AARCH64__asm_poly_power2round
+-PQCLEAN_dilithium5_AARCH64__asm_poly_power2round:
+-_PQCLEAN_dilithium5_AARCH64__asm_poly_power2round:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round
++PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round:
+ 
+     mov w4, #1
+ 
+@@ -563,10 +563,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_poly_power2round:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_poly_add
+-.global _PQCLEAN_dilithium5_AARCH64__asm_poly_add
+-PQCLEAN_dilithium5_AARCH64__asm_poly_add:
+-_PQCLEAN_dilithium5_AARCH64__asm_poly_add:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add
++PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add:
+ 
+     ld1 {v0.4S}, [x1], #16
+     ld1 {v4.4S}, [x2], #16
+@@ -612,10 +612,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_poly_add:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_poly_sub
+-.global _PQCLEAN_dilithium5_AARCH64__asm_poly_sub
+-PQCLEAN_dilithium5_AARCH64__asm_poly_sub:
+-_PQCLEAN_dilithium5_AARCH64__asm_poly_sub:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub
++PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub:
+ 
+     ld1 {v0.4S}, [x1], #16
+     ld1 {v4.4S}, [x2], #16
+@@ -661,10 +661,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_poly_sub:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_poly_shiftl
+-.global _PQCLEAN_dilithium5_AARCH64__asm_poly_shiftl
+-PQCLEAN_dilithium5_AARCH64__asm_poly_shiftl:
+-_PQCLEAN_dilithium5_AARCH64__asm_poly_shiftl:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl
++PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl:
+ 
+     add x1, x0, #0
+ 
+@@ -728,10 +728,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_poly_shiftl:
+     br lr
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_poly_pointwise_montgomery
+-.global _PQCLEAN_dilithium5_AARCH64__asm_poly_pointwise_montgomery
+-PQCLEAN_dilithium5_AARCH64__asm_poly_pointwise_montgomery:
+-_PQCLEAN_dilithium5_AARCH64__asm_poly_pointwise_montgomery:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery
++PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery:
+ 
+     push_all
+ 
+@@ -847,10 +847,10 @@ _PQCLEAN_dilithium5_AARCH64__asm_poly_pointwise_montgomery:
+ 
+ 
+ .align 2
+-.global PQCLEAN_dilithium5_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+-.global _PQCLEAN_dilithium5_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+-PQCLEAN_dilithium5_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
+-_PQCLEAN_dilithium5_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
++.global PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery
++.global _PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery
++PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
++_PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
+ 
+     push_all
+ 
+diff --git b/crypto_sign/dilithium5/aarch64/api.h a/crypto_sign/dilithium5/aarch64/api.h
+index 4c488918..ab5e2c41 100644
+--- b/crypto_sign/dilithium5/aarch64/api.h
++++ a/crypto_sign/dilithium5/aarch64/api.h
+@@ -5,34 +5,34 @@
+  * or public domain at https://github.com/pq-crystals/dilithium
+  */
+ 
+-#ifndef PQCLEAN_dilithium5_AARCH64_API_H
+-#define PQCLEAN_dilithium5_AARCH64_API_H
++#ifndef PQCLEAN_DILITHIUM5_AARCH64_API_H
++#define PQCLEAN_DILITHIUM5_AARCH64_API_H
+ 
+ 
+ #include <stddef.h>
+ #include <stdint.h>
+ 
+-#define PQCLEAN_dilithium5_AARCH64_CRYPTO_PUBLICKEYBYTES 2592
+-#define PQCLEAN_dilithium5_AARCH64_CRYPTO_SECRETKEYBYTES 4864
+-#define PQCLEAN_dilithium5_AARCH64_CRYPTO_BYTES 4595
+-#define PQCLEAN_dilithium5_AARCH64_CRYPTO_ALGNAME "Dilithium5"
++#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES 2592
++#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES 4864
++#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES 4595
++#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_ALGNAME "Dilithium5"
+ 
+ 
+-int PQCLEAN_dilithium5_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
++int PQCLEAN_DILITHIUM5_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+ 
+-int PQCLEAN_dilithium5_AARCH64_crypto_sign_signature(
++int PQCLEAN_DILITHIUM5_AARCH64_crypto_sign_signature(
+     uint8_t *sig, size_t *siglen,
+     const uint8_t *m, size_t mlen, const uint8_t *sk);
+ 
+-int PQCLEAN_dilithium5_AARCH64_crypto_sign_verify(
++int PQCLEAN_DILITHIUM5_AARCH64_crypto_sign_verify(
+     const uint8_t *sig, size_t siglen,
+     const uint8_t *m, size_t mlen, const uint8_t *pk);
+ 
+-int PQCLEAN_dilithium5_AARCH64_crypto_sign(
++int PQCLEAN_DILITHIUM5_AARCH64_crypto_sign(
+     uint8_t *sm, size_t *smlen,
+     const uint8_t *m, size_t mlen, const uint8_t *sk);
+ 
+-int PQCLEAN_dilithium5_AARCH64_crypto_sign_open(
++int PQCLEAN_DILITHIUM5_AARCH64_crypto_sign_open(
+     uint8_t *m, size_t *mlen,
+     const uint8_t *sm, size_t smlen, const uint8_t *pk);
+ 
+diff --git b/crypto_sign/dilithium5/aarch64/feat.S a/crypto_sign/dilithium5/aarch64/feat.S
+index fe2a90d5..01abc10a 100644
+--- b/crypto_sign/dilithium5/aarch64/feat.S
++++ a/crypto_sign/dilithium5/aarch64/feat.S
+@@ -123,10 +123,10 @@ SOFTWARE.
+ .endm
+ 
+ .align 4
+-.global PQCLEAN_dilithium5_AARCH64_f1600x2
+-.global _PQCLEAN_dilithium5_AARCH64_f1600x2
+-PQCLEAN_dilithium5_AARCH64_f1600x2:
+-_PQCLEAN_dilithium5_AARCH64_f1600x2:
++.global PQCLEAN_DILITHIUM5_AARCH64_f1600x2
++.global _PQCLEAN_DILITHIUM5_AARCH64_f1600x2
++PQCLEAN_DILITHIUM5_AARCH64_f1600x2:
++_PQCLEAN_DILITHIUM5_AARCH64_f1600x2:
+     stp d8,  d9,  [sp,#-16]!
+     stp d10, d11, [sp,#-16]!
+     stp d12, d13, [sp,#-16]!
+diff --git b/crypto_sign/dilithium5/aarch64/fips202x2.c a/crypto_sign/dilithium5/aarch64/fips202x2.c
+index 2573e2c8..259b199f 100644
+--- b/crypto_sign/dilithium5/aarch64/fips202x2.c
++++ a/crypto_sign/dilithium5/aarch64/fips202x2.c
+@@ -101,12 +101,12 @@ static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = {
+ *
+ * Arguments:   - uint64_t *state: pointer to input/output Keccak state
+ **************************************************/
+-extern void PQCLEAN_dilithium5_AARCH64_f1600x2(v128*, const uint64_t*);
++extern void PQCLEAN_DILITHIUM5_AARCH64_f1600x2(v128*, const uint64_t*);
+ static inline
+ void KeccakF1600_StatePermutex2(v128 state[25])
+ {
+ #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */
+-  PQCLEAN_dilithium5_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
++  PQCLEAN_DILITHIUM5_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
+ #else
+   v128 Aba, Abe, Abi, Abo, Abu;
+   v128 Aga, Age, Agi, Ago, Agu;
+diff --git b/crypto_sign/dilithium5/aarch64/ntt.h a/crypto_sign/dilithium5/aarch64/ntt.h
+index 6e593d84..e6f51189 100644
+--- b/crypto_sign/dilithium5/aarch64/ntt.h
++++ a/crypto_sign/dilithium5/aarch64/ntt.h
+@@ -36,20 +36,20 @@
+ #include "params.h"
+ #include <stdint.h>
+ 
+-extern void PQCLEAN_dilithium5_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
+-extern void PQCLEAN_dilithium5_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
+ 
+-extern void PQCLEAN_dilithium5_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
+-extern void PQCLEAN_dilithium5_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
+ 
+ #define NTT(in) { \
+-        PQCLEAN_dilithium5_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+-        PQCLEAN_dilithium5_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+     }
+ 
+ #define iNTT(in) { \
+-        PQCLEAN_dilithium5_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
+-        PQCLEAN_dilithium5_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
++        PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
+     }
+ 
+ #define ntt DILITHIUM_NAMESPACE(ntt)
+diff --git b/crypto_sign/dilithium5/aarch64/params.h a/crypto_sign/dilithium5/aarch64/params.h
+index 589b05b8..a967fd4a 100644
+--- b/crypto_sign/dilithium5/aarch64/params.h
++++ a/crypto_sign/dilithium5/aarch64/params.h
+@@ -12,8 +12,8 @@
+ //#define DILITHIUM_MODE 3
+ #define DILITHIUM_MODE 5
+ 
+-#define CRYPTO_NAMESPACETOP PQCLEAN_dilithium5_AARCH64_crypto_sign
+-#define CRYPTO_NAMESPACE(s) PQCLEAN_dilithium5_AARCH64_##s
++#define CRYPTO_NAMESPACETOP PQCLEAN_DILITHIUM5_AARCH64_crypto_sign
++#define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM5_AARCH64_##s
+ #define DILITHIUM_NAMESPACETOP CRYPTO_NAMESPACETOP
+ #define DILITHIUM_NAMESPACE(s) CRYPTO_NAMESPACE(s)
+ 
+diff --git b/crypto_sign/dilithium5/aarch64/poly.c a/crypto_sign/dilithium5/aarch64/poly.c
+index 9b22c9c9..788bb14b 100644
+--- b/crypto_sign/dilithium5/aarch64/poly.c
++++ a/crypto_sign/dilithium5/aarch64/poly.c
+@@ -57,11 +57,11 @@ static const int32_t montgomery_const[4] = {
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium5_AARCH64_asm_poly_reduce(int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce(int32_t *, const int32_t *);
+ void poly_reduce(poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium5_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce(a->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tred);
+ }
+@@ -74,11 +74,11 @@ void poly_reduce(poly *a) {
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium5_AARCH64_asm_poly_caddq(int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq(int32_t *, const int32_t *);
+ void poly_caddq(poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium5_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq(a->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tred);
+ }
+@@ -91,11 +91,11 @@ void poly_caddq(poly *a) {
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium5_AARCH64_asm_poly_freeze(int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze(int32_t *, const int32_t *);
+ void poly_freeze(poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium5_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze(a->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tred);
+ }
+@@ -205,11 +205,11 @@ void poly_invntt_tomont(poly *a) {
+ *              - const poly *a: pointer to first input polynomial
+ *              - const poly *b: pointer to second input polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium5_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
+ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium5_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
++    PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
+ 
+     DBENCH_STOP(*tmul);
+ }
+@@ -226,11 +226,11 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
+ *              - poly *a0: pointer to output polynomial with coefficients c0
+ *              - const poly *a: pointer to input polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium5_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
+ void poly_power2round(poly *a1, poly *a0, const poly *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium5_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
++    PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
+ 
+     DBENCH_STOP(*tround);
+ }
+@@ -738,11 +738,11 @@ void polyt1_pack(uint8_t *r, const poly *a) {
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+-extern void PQCLEAN_dilithium5_AARCH64_asm_10_to_32(int32_t *, const uint8_t *);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32(int32_t *, const uint8_t *);
+ void polyt1_unpack(poly *r, const uint8_t *a) {
+     DBENCH_START();
+ 
+-    PQCLEAN_dilithium5_AARCH64_asm_10_to_32(r->coeffs, a);
++    PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32(r->coeffs, a);
+ 
+     DBENCH_STOP(*tpack);
+ }
+diff --git b/crypto_sign/dilithium5/aarch64/polyvec.c a/crypto_sign/dilithium5/aarch64/polyvec.c
+index 262543ca..4496aaf0 100644
+--- b/crypto_sign/dilithium5/aarch64/polyvec.c
++++ a/crypto_sign/dilithium5/aarch64/polyvec.c
+@@ -178,11 +178,11 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyve
+ *              - const polyvecl *u: pointer to first input vector
+ *              - const polyvecl *v: pointer to second input vector
+ **************************************************/
+-extern void PQCLEAN_dilithium5_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
++extern void PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
+ void polyvecl_pointwise_acc_montgomery(poly *w,
+                                        const polyvecl *u,
+                                        const polyvecl *v) {
+-    PQCLEAN_dilithium5_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
++    PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
+ }
+ 
+ /*************************************************
diff --git a/scripts/copy_from_upstream/patches/pqclean-kyber-arm-macos-gcc-fix.patch b/scripts/copy_from_upstream/patches/pqclean-kyber-arm-macos-gcc-fix.patch
deleted file mode 100644
index b61bf78..0000000
--- a/scripts/copy_from_upstream/patches/pqclean-kyber-arm-macos-gcc-fix.patch
+++ /dev/null
@@ -1,318 +0,0 @@
-diff --git a/crypto_kem/kyber1024/aarch64/__asm_NTT.S b/crypto_kem/kyber1024/aarch64/__asm_NTT.S
-index 7f376ec..bf693c5 100644
---- a/crypto_kem/kyber1024/aarch64/__asm_NTT.S
-+++ b/crypto_kem/kyber1024/aarch64/__asm_NTT.S
-@@ -4,7 +4,7 @@
- .align 2
- .global PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top
- .global _PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top, %function
- #endif
- PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top:
-@@ -173,7 +173,7 @@ _PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top:
- .align 2
- .global PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot
- .global _PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot, %function
- #endif
- PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot:
-diff --git a/crypto_kem/kyber1024/aarch64/__asm_base_mul.S b/crypto_kem/kyber1024/aarch64/__asm_base_mul.S
-index 94ba36b..3eed305 100644
---- a/crypto_kem/kyber1024/aarch64/__asm_base_mul.S
-+++ b/crypto_kem/kyber1024/aarch64/__asm_base_mul.S
-@@ -6,7 +6,7 @@
- .align 2
- .global PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended
- .global _PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
-   .type PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended, %function
- #endif
- PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended:
-@@ -73,7 +73,7 @@ _PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended:
- .align 2
- .global PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul
- .global _PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul, %function
- #endif
- PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul:
-@@ -228,7 +228,7 @@ _PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul:
- .align 2
- .global PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery
- .global _PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery, %function
- #endif
- PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery:
-diff --git a/crypto_kem/kyber1024/aarch64/__asm_iNTT.S b/crypto_kem/kyber1024/aarch64/__asm_iNTT.S
-index 5701058..f7e83ab 100644
---- a/crypto_kem/kyber1024/aarch64/__asm_iNTT.S
-+++ b/crypto_kem/kyber1024/aarch64/__asm_iNTT.S
-@@ -4,7 +4,7 @@
- .align 2
- .global PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot
- .global _PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot, %function
- #endif
- PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot:
-@@ -90,7 +90,7 @@ _PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot:
- .align 2
- .global PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top
- .global _PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top, %function
- #endif
- PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top:
-diff --git a/crypto_kem/kyber1024/aarch64/__asm_poly.S b/crypto_kem/kyber1024/aarch64/__asm_poly.S
-index 0be0163..34a38bb 100644
---- a/crypto_kem/kyber1024/aarch64/__asm_poly.S
-+++ b/crypto_kem/kyber1024/aarch64/__asm_poly.S
-@@ -4,7 +4,7 @@
- .align 2
- .global PQCLEAN_KYBER1024_AARCH64_asm_add_reduce
- .global _PQCLEAN_KYBER1024_AARCH64_asm_add_reduce
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER1024_AARCH64_asm_add_reduce, %function
- #endif
- PQCLEAN_KYBER1024_AARCH64_asm_add_reduce:
-@@ -68,7 +68,7 @@ _PQCLEAN_KYBER1024_AARCH64_asm_add_reduce:
- .align 2
- .global PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce
- .global _PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce, %function
- #endif
- PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce:
-@@ -132,7 +132,7 @@ _PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce:
- .align 2
- .global PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce
- .global _PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce, %function
- #endif
- PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce:
-diff --git a/crypto_kem/kyber512/aarch64/__asm_NTT.S b/crypto_kem/kyber512/aarch64/__asm_NTT.S
-index 1abbca1..ada533e 100644
---- a/crypto_kem/kyber512/aarch64/__asm_NTT.S
-+++ b/crypto_kem/kyber512/aarch64/__asm_NTT.S
-@@ -4,7 +4,7 @@
- .align 2
- .global PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top
- .global _PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top, %function
- #endif
- PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top:
-@@ -173,7 +173,7 @@ _PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top:
- .align 2
- .global PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot
- .global _PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot, %function
- #endif
- PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot:
-diff --git a/crypto_kem/kyber512/aarch64/__asm_base_mul.S b/crypto_kem/kyber512/aarch64/__asm_base_mul.S
-index 185ac20..a3d39f1 100644
---- a/crypto_kem/kyber512/aarch64/__asm_base_mul.S
-+++ b/crypto_kem/kyber512/aarch64/__asm_base_mul.S
-@@ -6,7 +6,7 @@
- .align 2
- .global PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended
- .global _PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
-   .type PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended, %function
- #endif
- PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended:
-@@ -73,7 +73,7 @@ _PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended:
- .align 2
- .global PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul
- .global _PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul, %function
- #endif
- PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul:
-@@ -228,7 +228,7 @@ _PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul:
- .align 2
- .global PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery
- .global _PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery, %function
- #endif
- PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery:
-diff --git a/crypto_kem/kyber512/aarch64/__asm_iNTT.S b/crypto_kem/kyber512/aarch64/__asm_iNTT.S
-index c83694c..4a135e7 100644
---- a/crypto_kem/kyber512/aarch64/__asm_iNTT.S
-+++ b/crypto_kem/kyber512/aarch64/__asm_iNTT.S
-@@ -4,7 +4,7 @@
- .align 2
- .global PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot
- .global _PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot, %function
- #endif
- PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot:
-@@ -90,7 +90,7 @@ _PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot:
- .align 2
- .global PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top
- .global _PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top, %function
- #endif
- PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top:
-diff --git a/crypto_kem/kyber512/aarch64/__asm_poly.S b/crypto_kem/kyber512/aarch64/__asm_poly.S
-index 1be60ad..fa77b95 100644
---- a/crypto_kem/kyber512/aarch64/__asm_poly.S
-+++ b/crypto_kem/kyber512/aarch64/__asm_poly.S
-@@ -4,7 +4,7 @@
- .align 2
- .global PQCLEAN_KYBER512_AARCH64_asm_add_reduce
- .global _PQCLEAN_KYBER512_AARCH64_asm_add_reduce
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER512_AARCH64_asm_add_reduce, %function
- #endif
- PQCLEAN_KYBER512_AARCH64_asm_add_reduce:
-@@ -68,7 +68,7 @@ _PQCLEAN_KYBER512_AARCH64_asm_add_reduce:
- .align 2
- .global PQCLEAN_KYBER512_AARCH64_asm_sub_reduce
- .global _PQCLEAN_KYBER512_AARCH64_asm_sub_reduce
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER512_AARCH64_asm_sub_reduce, %function
- #endif
- PQCLEAN_KYBER512_AARCH64_asm_sub_reduce:
-@@ -132,7 +132,7 @@ _PQCLEAN_KYBER512_AARCH64_asm_sub_reduce:
- .align 2
- .global PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce
- .global _PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce, %function
- #endif
- PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce:
-diff --git a/crypto_kem/kyber768/aarch64/__asm_NTT.S b/crypto_kem/kyber768/aarch64/__asm_NTT.S
-index 19aa03c..bb2253e 100644
---- a/crypto_kem/kyber768/aarch64/__asm_NTT.S
-+++ b/crypto_kem/kyber768/aarch64/__asm_NTT.S
-@@ -4,7 +4,7 @@
- .align 2
- .global PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top
- .global _PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top, %function
- #endif
- PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top:
-@@ -173,7 +173,7 @@ _PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top:
- .align 2
- .global PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot
- .global _PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot, %function
- #endif
- PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot:
-diff --git a/crypto_kem/kyber768/aarch64/__asm_base_mul.S b/crypto_kem/kyber768/aarch64/__asm_base_mul.S
-index 1e9fb26..2bbb228 100644
---- a/crypto_kem/kyber768/aarch64/__asm_base_mul.S
-+++ b/crypto_kem/kyber768/aarch64/__asm_base_mul.S
-@@ -6,7 +6,7 @@
- .align 2
- .global PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended
- .global _PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
-   .type PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended, %function
- #endif
- PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended:
-@@ -73,7 +73,7 @@ _PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended:
- .align 2
- .global PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul
- .global _PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul, %function
- #endif
- PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul:
-@@ -228,7 +228,7 @@ _PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul:
- .align 2
- .global PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery
- .global _PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery, %function
- #endif
- PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery:
-diff --git a/crypto_kem/kyber768/aarch64/__asm_iNTT.S b/crypto_kem/kyber768/aarch64/__asm_iNTT.S
-index e6fcc40..cce9aa7 100644
---- a/crypto_kem/kyber768/aarch64/__asm_iNTT.S
-+++ b/crypto_kem/kyber768/aarch64/__asm_iNTT.S
-@@ -4,7 +4,7 @@
- .align 2
- .global PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot
- .global _PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot, %function
- #endif
- PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot:
-@@ -90,7 +90,7 @@ _PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot:
- .align 2
- .global PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top
- .global _PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top, %function
- #endif
- PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top:
-diff --git a/crypto_kem/kyber768/aarch64/__asm_poly.S b/crypto_kem/kyber768/aarch64/__asm_poly.S
-index 0063959..9d7816c 100644
---- a/crypto_kem/kyber768/aarch64/__asm_poly.S
-+++ b/crypto_kem/kyber768/aarch64/__asm_poly.S
-@@ -4,7 +4,7 @@
- .align 2
- .global PQCLEAN_KYBER768_AARCH64_asm_add_reduce
- .global _PQCLEAN_KYBER768_AARCH64_asm_add_reduce
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER768_AARCH64_asm_add_reduce, %function
- #endif
- PQCLEAN_KYBER768_AARCH64_asm_add_reduce:
-@@ -68,7 +68,7 @@ _PQCLEAN_KYBER768_AARCH64_asm_add_reduce:
- .align 2
- .global PQCLEAN_KYBER768_AARCH64_asm_sub_reduce
- .global _PQCLEAN_KYBER768_AARCH64_asm_sub_reduce
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER768_AARCH64_asm_sub_reduce, %function
- #endif
- PQCLEAN_KYBER768_AARCH64_asm_sub_reduce:
-@@ -132,7 +132,7 @@ _PQCLEAN_KYBER768_AARCH64_asm_sub_reduce:
- .align 2
- .global PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce
- .global _PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce
--#ifndef __clang__
-+#if !defined(__clang__) && !defined(old_gas_syntax)
- .type PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce, %function
- #endif
- PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce:
diff --git a/scripts/copy_from_upstream/patches/pqclean-kyber-armneon-768-1024-fixes.patch b/scripts/copy_from_upstream/patches/pqclean-kyber-armneon-768-1024-fixes.patch
new file mode 100644
index 0000000..8f16637
--- /dev/null
+++ b/scripts/copy_from_upstream/patches/pqclean-kyber-armneon-768-1024-fixes.patch
@@ -0,0 +1,283 @@
+diff --git a/crypto_kem/kyber1024/aarch64/indcpa.c b/crypto_kem/kyber1024/aarch64/indcpa.c
+index 6b83943e..43f489f0 100644
+--- a/crypto_kem/kyber1024/aarch64/indcpa.c
++++ b/crypto_kem/kyber1024/aarch64/indcpa.c
+@@ -160,39 +160,44 @@ static void unpack_ciphertext(int16_t b[KYBER_K][KYBER_N], int16_t *v, const uin
+ **************************************************/
+ #define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
+ // Not static for benchmarking
+-void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed) {
+-    unsigned int ctr0, ctr1, k;
+-    unsigned int buflen, off;
+-    uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
+-            buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
+-    neon_xof_state state;
+-
+-    for (unsigned int i = 0; i < KYBER_K; i++) {
+-        if (transposed) {
+-            neon_xof_absorb(&state, seed, i, i, 0, 1);
+-        } else {
+-            neon_xof_absorb(&state, seed, 0, 1, i, i);
++void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed)
++{
++  unsigned int ctr0, ctr1, k;
++  unsigned int buflen, off;
++  uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
++      buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
++  neon_xof_state state;
++
++  for (unsigned int i = 0; i < KYBER_K; i++)
++  {
++    for (unsigned int j = 0; j < KYBER_K; j += 2)
++    {
++      if (transposed)
++        neon_xof_absorb(&state, seed, i, i, j, j + 1);
++      else
++        neon_xof_absorb(&state, seed, j, j + 1, i, i);
++
++      neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
++      buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
++      ctr0 = neon_rej_uniform(&(a[i][j][0]), buf0);
++      ctr1 = neon_rej_uniform(&(a[i][j + 1][0]), buf1);
++
++      while (ctr0 < KYBER_N || ctr1 < KYBER_N)
++      {
++        off = buflen % 3;
++        for (k = 0; k < off; k++)
++        {
++          buf0[k] = buf0[buflen - off + k];
++          buf1[k] = buf1[buflen - off + k];
+         }
++        neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
+ 
+-        neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
+-
+-        buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
+-
+-        ctr0 = neon_rej_uniform(&(a[i][0][0]), buf0);
+-        ctr1 = neon_rej_uniform(&(a[i][1][0]), buf1);
+-        while (ctr0 < KYBER_N || ctr1 < KYBER_N) {
+-            off = buflen % 3;
+-            for (k = 0; k < off; k++) {
+-                buf0[k] = buf0[buflen - off + k];
+-                buf1[k] = buf1[buflen - off + k];
+-            }
+-            neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
+-
+-            buflen = off + XOF_BLOCKBYTES;
+-            ctr0 += rej_uniform(&(a[i][0][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
+-            ctr1 += rej_uniform(&(a[i][1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen);
+-        }
++        buflen = off + XOF_BLOCKBYTES;
++        ctr0 += rej_uniform(&(a[i][j][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
++        ctr1 += rej_uniform(&(a[i][j + 1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen);
++      }
+     }
++  }
+ }
+ 
+ /*************************************************
+@@ -224,7 +229,9 @@ void indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
+     gen_a(a, publicseed);
+ 
+     neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1);
+-    neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 2, 3);
++    neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(skpv[3][0]), noiseseed, 2, 3);
++    neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 4, 5);
++    neon_poly_getnoise_eta1_2x(&(e[2][0]), &(e[3][0]), noiseseed, 6, 7);
+ 
+     neon_polyvec_ntt(skpv);
+     neon_polyvec_ntt(e);
+@@ -280,10 +287,11 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
+     poly_frommsg(k, m);
+     gen_at(at, seed);
+ 
+-  // ETA1 != ETA2 (3 != 2)
+-  neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1);
+-  neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3);
+-  neon_poly_getnoise_eta2(&(epp[0]), coins, 4);
++    neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1);
++    neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(sp[3][0]), coins, 2, 3);
++    neon_poly_getnoise_eta1_2x(&(ep[0][0]), &(ep[1][0]), coins, 4, 5);
++    neon_poly_getnoise_eta1_2x(&(ep[2][0]), &(ep[3][0]), coins, 6, 7);
++    neon_poly_getnoise_eta2(&(epp[0]), coins, 8);
+ 
+     neon_polyvec_ntt(sp);
+ 
+diff --git a/crypto_kem/kyber768/aarch64/indcpa.c b/crypto_kem/kyber768/aarch64/indcpa.c
+index 02448809..ff24f150 100644
+--- a/crypto_kem/kyber768/aarch64/indcpa.c
++++ b/crypto_kem/kyber768/aarch64/indcpa.c
+@@ -160,39 +160,114 @@ static void unpack_ciphertext(int16_t b[KYBER_K][KYBER_N], int16_t *v, const uin
+ **************************************************/
+ #define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
+ // Not static for benchmarking
+-void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed) {
+-    unsigned int ctr0, ctr1, k;
+-    unsigned int buflen, off;
+-    uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
+-            buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
+-    neon_xof_state state;
+-
+-    for (unsigned int i = 0; i < KYBER_K; i++) {
+-        if (transposed) {
+-            neon_xof_absorb(&state, seed, i, i, 0, 1);
+-        } else {
+-            neon_xof_absorb(&state, seed, 0, 1, i, i);
+-        }
+-
+-        neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
+-
+-        buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
+-
+-        ctr0 = neon_rej_uniform(&(a[i][0][0]), buf0);
+-        ctr1 = neon_rej_uniform(&(a[i][1][0]), buf1);
+-        while (ctr0 < KYBER_N || ctr1 < KYBER_N) {
+-            off = buflen % 3;
+-            for (k = 0; k < off; k++) {
+-                buf0[k] = buf0[buflen - off + k];
+-                buf1[k] = buf1[buflen - off + k];
+-            }
+-            neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
+-
+-            buflen = off + XOF_BLOCKBYTES;
+-            ctr0 += rej_uniform(&(a[i][0][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
+-            ctr1 += rej_uniform(&(a[i][1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen);
+-        }
++void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed)
++{
++  unsigned int ctr0, ctr1, k;
++  unsigned int buflen, off;
++  uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
++      buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
++  neon_xof_state state;
++
++  int16_t *s1 = NULL, *s2 = NULL;
++  unsigned int x1, x2, y1, y2;
++  xof_state c_state;
++  shake128_inc_init(&c_state); // patch
++
++  for (unsigned int j = 0; j < KYBER_K * KYBER_K - 1; j += 2)
++  {
++    switch (j)
++    {
++    case 0:
++      s1 = &(a[0][0][0]);
++      s2 = &(a[0][1][0]);
++      x1 = 0;
++      y1 = 0;
++      x2 = 0;
++      y2 = 1;
++      break;
++    case 2:
++      s1 = &(a[0][2][0]);
++      s2 = &(a[1][0][0]);
++      x1 = 0;
++      y1 = 2;
++      x2 = 1;
++      y2 = 0;
++      break;
++    case 4:
++      s1 = &(a[1][1][0]);
++      s2 = &(a[1][2][0]);
++      x1 = 1;
++      y1 = 1;
++      x2 = 1;
++      y2 = 2;
++      break;
++    default:
++      s1 = &(a[2][0][0]);
++      s2 = &(a[2][1][0]);
++      x1 = 2;
++      y1 = 0;
++      x2 = 2;
++      y2 = 1;
++      break;
+     }
++
++    if (transposed)
++      neon_xof_absorb(&state, seed, x1, x2, y1, y2);
++    else
++      neon_xof_absorb(&state, seed, y1, y2, x1, x2);
++
++    neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
++
++    buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
++
++    ctr0 = neon_rej_uniform(s1, buf0);
++    ctr1 = neon_rej_uniform(s2, buf1);
++
++    while (ctr0 < KYBER_N || ctr1 < KYBER_N)
++    {
++      off = buflen % 3;
++      for (k = 0; k < off; k++)
++      {
++        buf0[k] = buf0[buflen - off + k];
++        buf1[k] = buf1[buflen - off + k];
++      }
++      neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
++
++      buflen = off + XOF_BLOCKBYTES;
++      ctr0 += rej_uniform(s1 + ctr0, KYBER_N - ctr0, buf0, buflen);
++      ctr1 += rej_uniform(s2 + ctr1, KYBER_N - ctr1, buf1, buflen);
++    }
++  }
++
++  // Last iteration [2][2]
++  if (transposed){
++    xof_absorb(&c_state, seed, 2, 2);
++  }
++  else{
++    xof_absorb(&c_state, seed, 2, 2);
++  }
++
++  xof_squeezeblocks(buf0, GEN_MATRIX_NBLOCKS, &c_state);
++
++  buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
++
++  ctr0 = neon_rej_uniform(&(a[2][2][0]), buf0);
++
++  while (ctr0 < KYBER_N)
++  {
++    off = buflen % 3;
++    for (k = 0; k < off; k++)
++    {
++      buf0[k] = buf0[buflen - off + k];
++    }
++    xof_squeezeblocks(buf0 + off, 1, &c_state);
++
++    buflen = off + XOF_BLOCKBYTES;
++    ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
++  }
++
++  shake128_inc_ctx_release(&c_state);
++
+ }
+ 
+ /*************************************************
+@@ -224,7 +299,8 @@ void indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
+     gen_a(a, publicseed);
+ 
+     neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1);
+-    neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 2, 3);
++    neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(e[0][0]), noiseseed, 2, 3);
++    neon_poly_getnoise_eta1_2x(&(e[1][0]), &(e[2][0]), noiseseed, 4, 5);
+ 
+     neon_polyvec_ntt(skpv);
+     neon_polyvec_ntt(e);
+@@ -280,10 +356,11 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
+     poly_frommsg(k, m);
+     gen_at(at, seed);
+ 
+-  // ETA1 != ETA2 (3 != 2)
+-  neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1);
+-  neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3);
+-  neon_poly_getnoise_eta2(&(epp[0]), coins, 4);
++    // Because ETA1 == ETA2
++    neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1);
++    neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(ep[0][0]), coins, 2, 3);
++    neon_poly_getnoise_eta1_2x(&(ep[1][0]), &(ep[2][0]), coins, 4, 5);
++    neon_poly_getnoise_eta2(&(epp[0]), coins, 6);
+ 
+     neon_polyvec_ntt(sp);
+ 
diff --git a/scripts/copy_from_upstream/patches/pqclean-kyber-armneon-shake-fixes.patch b/scripts/copy_from_upstream/patches/pqclean-kyber-armneon-shake-fixes.patch
new file mode 100644
index 0000000..5efaeb1
--- /dev/null
+++ b/scripts/copy_from_upstream/patches/pqclean-kyber-armneon-shake-fixes.patch
@@ -0,0 +1,285 @@
+diff --git b/crypto_kem/kyber1024/aarch64/fips202x2.h a/crypto_kem/kyber1024/aarch64/fips202x2.h
+index a9f8f7c4..a1eacdf9 100644
+--- b/crypto_kem/kyber1024/aarch64/fips202x2.h
++++ a/crypto_kem/kyber1024/aarch64/fips202x2.h
+@@ -12,15 +12,10 @@
+ #include "params.h"
+ #include <arm_neon.h>
+ #include <stddef.h>
++#include "fips202.h"
+ 
+ typedef uint64x2_t v128;
+ 
+-#define SHAKE128_RATE 168
+-#define SHAKE256_RATE 136
+-#define SHA3_256_RATE 136
+-#define SHA3_512_RATE 72
+-
+-
+ typedef struct {
+     v128 s[25];
+ } keccakx2_state;
+diff --git b/crypto_kem/kyber1024/aarch64/neon_poly.c a/crypto_kem/kyber1024/aarch64/neon_poly.c
+index 0de98583..6d787dde 100644
+--- b/crypto_kem/kyber1024/aarch64/neon_poly.c
++++ a/crypto_kem/kyber1024/aarch64/neon_poly.c
+@@ -131,14 +131,14 @@ void neon_poly_invntt_tomont(int16_t r[KYBER_N]) {
+ *            - const poly *a: pointer to first input polynomial
+ *            - const poly *b: pointer to second input polynomial
+ **************************************************/
+-extern void PQCLEAN_KYBER1024_AARCH64_asm_add_reduce(int16_t *, const int16_t *);
++extern void PQCLEAN_KYBER1024_AARCH64__asm_add_reduce(int16_t *, const int16_t *);
+ void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
+-    PQCLEAN_KYBER1024_AARCH64_asm_add_reduce(c, a);
++    PQCLEAN_KYBER1024_AARCH64__asm_add_reduce(c, a);
+ }
+ 
+-extern void PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *);
++extern void PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *);
+ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) {
+-    PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce(c, a, b);
++    PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce(c, a, b);
+ }
+ 
+ /*************************************************
+@@ -152,7 +152,7 @@ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], cons
+ *            - const poly *a: pointer to first input polynomial
+ *            - const poly *b: pointer to second input polynomial
+ **************************************************/
+-extern void PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce(int16_t *, const int16_t *);
++extern void PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce(int16_t *, const int16_t *);
+ void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
+-    PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce(c, a);
++    PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce(c, a);
+ }
+diff --git b/crypto_kem/kyber1024/aarch64/symmetric-shake.c a/crypto_kem/kyber1024/aarch64/symmetric-shake.c
+index bbc0f2c6..e7e7e874 100644
+--- b/crypto_kem/kyber1024/aarch64/symmetric-shake.c
++++ a/crypto_kem/kyber1024/aarch64/symmetric-shake.c
+@@ -22,7 +22,7 @@
+ *              - uint8_t i: additional byte of input
+ *              - uint8_t j: additional byte of input
+ **************************************************/
+-void kyber_shake128_absorb(shake128ctx *state,
++void kyber_shake128_absorb(shake128incctx *state,
+                            const uint8_t seed[KYBER_SYMBYTES],
+                            uint8_t x,
+                            uint8_t y) {
+@@ -32,7 +32,7 @@ void kyber_shake128_absorb(shake128ctx *state,
+     extseed[KYBER_SYMBYTES + 0] = x;
+     extseed[KYBER_SYMBYTES + 1] = y;
+ 
+-    shake128_absorb(state, extseed, sizeof(extseed));
++    shake128_absorb_once(state, extseed, sizeof(extseed));
+ }
+ 
+ /*************************************************
+diff --git b/crypto_kem/kyber1024/aarch64/symmetric.h a/crypto_kem/kyber1024/aarch64/symmetric.h
+index d4973b8b..12f6a5cf 100644
+--- b/crypto_kem/kyber1024/aarch64/symmetric.h
++++ a/crypto_kem/kyber1024/aarch64/symmetric.h
+@@ -16,12 +16,12 @@
+ 
+ #include "fips202.h"
+ 
+-typedef shake128ctx xof_state;
++typedef shake128incctx xof_state;
+ 
+ 
+ 
+ #define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
+-void kyber_shake128_absorb(shake128ctx *s,
++void kyber_shake128_absorb(shake128incctx *s,
+                            const uint8_t seed[KYBER_SYMBYTES],
+                            uint8_t x,
+                            uint8_t y);
+diff --git b/crypto_kem/kyber512/aarch64/fips202x2.h a/crypto_kem/kyber512/aarch64/fips202x2.h
+index a9f8f7c4..a1eacdf9 100644
+--- b/crypto_kem/kyber512/aarch64/fips202x2.h
++++ a/crypto_kem/kyber512/aarch64/fips202x2.h
+@@ -12,15 +12,10 @@
+ #include "params.h"
+ #include <arm_neon.h>
+ #include <stddef.h>
++#include "fips202.h"
+ 
+ typedef uint64x2_t v128;
+ 
+-#define SHAKE128_RATE 168
+-#define SHAKE256_RATE 136
+-#define SHA3_256_RATE 136
+-#define SHA3_512_RATE 72
+-
+-
+ typedef struct {
+     v128 s[25];
+ } keccakx2_state;
+diff --git b/crypto_kem/kyber512/aarch64/neon_poly.c a/crypto_kem/kyber512/aarch64/neon_poly.c
+index 8ea6ba4f..fdb37f9d 100644
+--- b/crypto_kem/kyber512/aarch64/neon_poly.c
++++ a/crypto_kem/kyber512/aarch64/neon_poly.c
+@@ -131,14 +131,14 @@ void neon_poly_invntt_tomont(int16_t r[KYBER_N]) {
+ *            - const poly *a: pointer to first input polynomial
+ *            - const poly *b: pointer to second input polynomial
+ **************************************************/
+-extern void PQCLEAN_KYBER512_AARCH64_asm_add_reduce(int16_t *, const int16_t *);
++extern void PQCLEAN_KYBER512_AARCH64__asm_add_reduce(int16_t *, const int16_t *);
+ void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
+-    PQCLEAN_KYBER512_AARCH64_asm_add_reduce(c, a);
++    PQCLEAN_KYBER512_AARCH64__asm_add_reduce(c, a);
+ }
+ 
+-extern void PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *);
++extern void PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *);
+ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) {
+-    PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce(c, a, b);
++    PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce(c, a, b);
+ }
+ 
+ /*************************************************
+@@ -152,7 +152,7 @@ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], cons
+ *            - const poly *a: pointer to first input polynomial
+ *            - const poly *b: pointer to second input polynomial
+ **************************************************/
+-extern void PQCLEAN_KYBER512_AARCH64_asm_sub_reduce(int16_t *, const int16_t *);
++extern void PQCLEAN_KYBER512_AARCH64__asm_sub_reduce(int16_t *, const int16_t *);
+ void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
+-    PQCLEAN_KYBER512_AARCH64_asm_sub_reduce(c, a);
++    PQCLEAN_KYBER512_AARCH64__asm_sub_reduce(c, a);
+ }
+diff --git b/crypto_kem/kyber512/aarch64/symmetric-shake.c a/crypto_kem/kyber512/aarch64/symmetric-shake.c
+index bbc0f2c6..e7e7e874 100644
+--- b/crypto_kem/kyber512/aarch64/symmetric-shake.c
++++ a/crypto_kem/kyber512/aarch64/symmetric-shake.c
+@@ -22,7 +22,7 @@
+ *              - uint8_t i: additional byte of input
+ *              - uint8_t j: additional byte of input
+ **************************************************/
+-void kyber_shake128_absorb(shake128ctx *state,
++void kyber_shake128_absorb(shake128incctx *state,
+                            const uint8_t seed[KYBER_SYMBYTES],
+                            uint8_t x,
+                            uint8_t y) {
+@@ -32,7 +32,7 @@ void kyber_shake128_absorb(shake128ctx *state,
+     extseed[KYBER_SYMBYTES + 0] = x;
+     extseed[KYBER_SYMBYTES + 1] = y;
+ 
+-    shake128_absorb(state, extseed, sizeof(extseed));
++    shake128_absorb_once(state, extseed, sizeof(extseed));
+ }
+ 
+ /*************************************************
+diff --git b/crypto_kem/kyber512/aarch64/symmetric.h a/crypto_kem/kyber512/aarch64/symmetric.h
+index d4973b8b..12f6a5cf 100644
+--- b/crypto_kem/kyber512/aarch64/symmetric.h
++++ a/crypto_kem/kyber512/aarch64/symmetric.h
+@@ -16,12 +16,12 @@
+ 
+ #include "fips202.h"
+ 
+-typedef shake128ctx xof_state;
++typedef shake128incctx xof_state;
+ 
+ 
+ 
+ #define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
+-void kyber_shake128_absorb(shake128ctx *s,
++void kyber_shake128_absorb(shake128incctx *s,
+                            const uint8_t seed[KYBER_SYMBYTES],
+                            uint8_t x,
+                            uint8_t y);
+diff --git b/crypto_kem/kyber768/aarch64/fips202x2.h a/crypto_kem/kyber768/aarch64/fips202x2.h
+index a9f8f7c4..a1eacdf9 100644
+--- b/crypto_kem/kyber768/aarch64/fips202x2.h
++++ a/crypto_kem/kyber768/aarch64/fips202x2.h
+@@ -12,15 +12,10 @@
+ #include "params.h"
+ #include <arm_neon.h>
+ #include <stddef.h>
++#include "fips202.h"
+ 
+ typedef uint64x2_t v128;
+ 
+-#define SHAKE128_RATE 168
+-#define SHAKE256_RATE 136
+-#define SHA3_256_RATE 136
+-#define SHA3_512_RATE 72
+-
+-
+ typedef struct {
+     v128 s[25];
+ } keccakx2_state;
+diff --git b/crypto_kem/kyber768/aarch64/neon_poly.c a/crypto_kem/kyber768/aarch64/neon_poly.c
+index 70d31c3c..cd6ce6e9 100644
+--- b/crypto_kem/kyber768/aarch64/neon_poly.c
++++ a/crypto_kem/kyber768/aarch64/neon_poly.c
+@@ -131,14 +131,14 @@ void neon_poly_invntt_tomont(int16_t r[KYBER_N]) {
+ *            - const poly *a: pointer to first input polynomial
+ *            - const poly *b: pointer to second input polynomial
+ **************************************************/
+-extern void PQCLEAN_KYBER768_AARCH64_asm_add_reduce(int16_t *, const int16_t *);
++extern void PQCLEAN_KYBER768_AARCH64__asm_add_reduce(int16_t *, const int16_t *);
+ void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
+-    PQCLEAN_KYBER768_AARCH64_asm_add_reduce(c, a);
++    PQCLEAN_KYBER768_AARCH64__asm_add_reduce(c, a);
+ }
+ 
+-extern void PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *);
++extern void PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *);
+ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) {
+-    PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce(c, a, b);
++    PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce(c, a, b);
+ }
+ 
+ /*************************************************
+@@ -152,7 +152,7 @@ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], cons
+ *            - const poly *a: pointer to first input polynomial
+ *            - const poly *b: pointer to second input polynomial
+ **************************************************/
+-extern void PQCLEAN_KYBER768_AARCH64_asm_sub_reduce(int16_t *, const int16_t *);
++extern void PQCLEAN_KYBER768_AARCH64__asm_sub_reduce(int16_t *, const int16_t *);
+ void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
+-    PQCLEAN_KYBER768_AARCH64_asm_sub_reduce(c, a);
++    PQCLEAN_KYBER768_AARCH64__asm_sub_reduce(c, a);
+ }
+diff --git b/crypto_kem/kyber768/aarch64/symmetric-shake.c a/crypto_kem/kyber768/aarch64/symmetric-shake.c
+index bbc0f2c6..e7e7e874 100644
+--- b/crypto_kem/kyber768/aarch64/symmetric-shake.c
++++ a/crypto_kem/kyber768/aarch64/symmetric-shake.c
+@@ -22,7 +22,7 @@
+ *              - uint8_t i: additional byte of input
+ *              - uint8_t j: additional byte of input
+ **************************************************/
+-void kyber_shake128_absorb(shake128ctx *state,
++void kyber_shake128_absorb(shake128incctx *state,
+                            const uint8_t seed[KYBER_SYMBYTES],
+                            uint8_t x,
+                            uint8_t y) {
+@@ -32,7 +32,7 @@ void kyber_shake128_absorb(shake128ctx *state,
+     extseed[KYBER_SYMBYTES + 0] = x;
+     extseed[KYBER_SYMBYTES + 1] = y;
+ 
+-    shake128_absorb(state, extseed, sizeof(extseed));
++    shake128_absorb_once(state, extseed, sizeof(extseed));
+ }
+ 
+ /*************************************************
+diff --git b/crypto_kem/kyber768/aarch64/symmetric.h a/crypto_kem/kyber768/aarch64/symmetric.h
+index d4973b8b..12f6a5cf 100644
+--- b/crypto_kem/kyber768/aarch64/symmetric.h
++++ a/crypto_kem/kyber768/aarch64/symmetric.h
+@@ -16,12 +16,12 @@
+ 
+ #include "fips202.h"
+ 
+-typedef shake128ctx xof_state;
++typedef shake128incctx xof_state;
+ 
+ 
+ 
+ #define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
+-void kyber_shake128_absorb(shake128ctx *s,
++void kyber_shake128_absorb(shake128incctx *s,
+                            const uint8_t seed[KYBER_SYMBYTES],
+                            uint8_t x,
+                            uint8_t y);
diff --git a/scripts/copy_from_upstream/patches/pqclean-kyber-armneon-shake.patch b/scripts/copy_from_upstream/patches/pqclean-kyber-armneon-shake.patch
deleted file mode 100644
index 4d77884..0000000
--- a/scripts/copy_from_upstream/patches/pqclean-kyber-armneon-shake.patch
+++ /dev/null
@@ -1,232 +0,0 @@
-From 3a86145c7ec6ba72713c422bc40d686626405657 Mon Sep 17 00:00:00 2001
-From: Martyrshot <Martyrshot@gmail.com>
-Date: Sun, 31 Oct 2021 18:23:51 +0000
-Subject: [PATCH] shimming in pqclean's aarch64 optimized kyber implementations
-
----
- crypto_kem/kyber1024/aarch64/fips202x2.h       | 8 ++------
- crypto_kem/kyber1024/aarch64/symmetric-shake.c | 4 ++--
- crypto_kem/kyber1024/aarch64/symmetric.h       | 4 ++--
- crypto_kem/kyber512/aarch64/fips202x2.h        | 7 ++-----
- crypto_kem/kyber512/aarch64/symmetric-shake.c  | 4 ++--
- crypto_kem/kyber512/aarch64/symmetric.h        | 4 ++--
- crypto_kem/kyber768/aarch64/fips202x2.h        | 8 ++------
- crypto_kem/kyber768/aarch64/indcpa.c           | 3 ++-
- crypto_kem/kyber768/aarch64/symmetric-shake.c  | 4 ++--
- crypto_kem/kyber768/aarch64/symmetric.h        | 5 +++--
- 10 files changed, 21 insertions(+), 30 deletions(-)
-
-diff --git a/crypto_kem/kyber1024/aarch64/fips202x2.h b/crypto_kem/kyber1024/aarch64/fips202x2.h
-index 3b9dd97..7cffd7b 100644
---- a/crypto_kem/kyber1024/aarch64/fips202x2.h
-+++ b/crypto_kem/kyber1024/aarch64/fips202x2.h
-@@ -5,13 +5,9 @@
- #include <arm_neon.h>
- #include <stddef.h>
- 
--typedef uint64x2_t v128;
--
--#define SHAKE128_RATE 168
--#define SHAKE256_RATE 136
--#define SHA3_256_RATE 136
--#define SHA3_512_RATE 72
-+#include "fips202.h"
- 
-+typedef uint64x2_t v128;
- 
- typedef struct {
-     v128 s[25];
-diff --git a/crypto_kem/kyber1024/aarch64/symmetric-shake.c b/crypto_kem/kyber1024/aarch64/symmetric-shake.c
-index 72059d4..9311d5d 100644
---- a/crypto_kem/kyber1024/aarch64/symmetric-shake.c
-+++ b/crypto_kem/kyber1024/aarch64/symmetric-shake.c
-@@ -15,7 +15,7 @@
- *              - uint8_t i: additional byte of input
- *              - uint8_t j: additional byte of input
- **************************************************/
--void kyber_shake128_absorb(shake128ctx *state,
-+void kyber_shake128_absorb(shake128incctx *state,
-                            const uint8_t seed[KYBER_SYMBYTES],
-                            uint8_t x,
-                            uint8_t y) {
-@@ -25,7 +25,7 @@ void kyber_shake128_absorb(shake128ctx *state,
-     extseed[KYBER_SYMBYTES + 0] = x;
-     extseed[KYBER_SYMBYTES + 1] = y;
- 
--    shake128_absorb(state, extseed, sizeof(extseed));
-+    shake128_absorb_once(state, extseed, sizeof(extseed));
- }
- 
- /*************************************************
-diff --git a/crypto_kem/kyber1024/aarch64/symmetric.h b/crypto_kem/kyber1024/aarch64/symmetric.h
-index 471995c..ac0a783 100644
---- a/crypto_kem/kyber1024/aarch64/symmetric.h
-+++ b/crypto_kem/kyber1024/aarch64/symmetric.h
-@@ -8,12 +8,12 @@
- 
- #include "fips202.h"
- 
--typedef shake128ctx xof_state;
-+typedef shake128incctx xof_state;
- 
- 
- 
- #define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
--void kyber_shake128_absorb(shake128ctx *s,
-+void kyber_shake128_absorb(shake128incctx *s,
-                            const uint8_t seed[KYBER_SYMBYTES],
-                            uint8_t x,
-                            uint8_t y);
-diff --git a/crypto_kem/kyber512/aarch64/fips202x2.h b/crypto_kem/kyber512/aarch64/fips202x2.h
-index 3b9dd97..426988d 100644
---- a/crypto_kem/kyber512/aarch64/fips202x2.h
-+++ b/crypto_kem/kyber512/aarch64/fips202x2.h
-@@ -5,12 +5,9 @@
- #include <arm_neon.h>
- #include <stddef.h>
- 
--typedef uint64x2_t v128;
-+#include "fips202.h"
- 
--#define SHAKE128_RATE 168
--#define SHAKE256_RATE 136
--#define SHA3_256_RATE 136
--#define SHA3_512_RATE 72
-+typedef uint64x2_t v128;
- 
- 
- typedef struct {
-diff --git a/crypto_kem/kyber512/aarch64/symmetric-shake.c b/crypto_kem/kyber512/aarch64/symmetric-shake.c
-index 72059d4..9311d5d 100644
---- a/crypto_kem/kyber512/aarch64/symmetric-shake.c
-+++ b/crypto_kem/kyber512/aarch64/symmetric-shake.c
-@@ -15,7 +15,7 @@
- *              - uint8_t i: additional byte of input
- *              - uint8_t j: additional byte of input
- **************************************************/
--void kyber_shake128_absorb(shake128ctx *state,
-+void kyber_shake128_absorb(shake128incctx *state,
-                            const uint8_t seed[KYBER_SYMBYTES],
-                            uint8_t x,
-                            uint8_t y) {
-@@ -25,7 +25,7 @@ void kyber_shake128_absorb(shake128ctx *state,
-     extseed[KYBER_SYMBYTES + 0] = x;
-     extseed[KYBER_SYMBYTES + 1] = y;
- 
--    shake128_absorb(state, extseed, sizeof(extseed));
-+    shake128_absorb_once(state, extseed, sizeof(extseed));
- }
- 
- /*************************************************
-diff --git a/crypto_kem/kyber512/aarch64/symmetric.h b/crypto_kem/kyber512/aarch64/symmetric.h
-index 471995c..ac0a783 100644
---- a/crypto_kem/kyber512/aarch64/symmetric.h
-+++ b/crypto_kem/kyber512/aarch64/symmetric.h
-@@ -8,12 +8,12 @@
- 
- #include "fips202.h"
- 
--typedef shake128ctx xof_state;
-+typedef shake128incctx xof_state;
- 
- 
- 
- #define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
--void kyber_shake128_absorb(shake128ctx *s,
-+void kyber_shake128_absorb(shake128incctx *s,
-                            const uint8_t seed[KYBER_SYMBYTES],
-                            uint8_t x,
-                            uint8_t y);
-diff --git a/crypto_kem/kyber768/aarch64/fips202x2.h b/crypto_kem/kyber768/aarch64/fips202x2.h
-index 3b9dd97..7cffd7b 100644
---- a/crypto_kem/kyber768/aarch64/fips202x2.h
-+++ b/crypto_kem/kyber768/aarch64/fips202x2.h
-@@ -5,13 +5,9 @@
- #include <arm_neon.h>
- #include <stddef.h>
- 
--typedef uint64x2_t v128;
--
--#define SHAKE128_RATE 168
--#define SHAKE256_RATE 136
--#define SHA3_256_RATE 136
--#define SHA3_512_RATE 72
-+#include "fips202.h"
- 
-+typedef uint64x2_t v128;
- 
- typedef struct {
-     v128 s[25];
-diff --git a/crypto_kem/kyber768/aarch64/indcpa.c b/crypto_kem/kyber768/aarch64/indcpa.c
-index 24887e2..c273dc4 100644
---- a/crypto_kem/kyber768/aarch64/indcpa.c
-+++ b/crypto_kem/kyber768/aarch64/indcpa.c
-@@ -135,6 +135,7 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S
-     int16_t *s1 = NULL, *s2 = NULL;
-     unsigned int x1, x2, y1, y2;
-     xof_state c_state;
-+    xof_init(&c_state);
- 
-     for (unsigned int j = 0; j < KYBER_K * KYBER_K - 1; j += 2) {
-         switch (j) {
-@@ -222,7 +223,7 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S
-         buflen = off + XOF_BLOCKBYTES;
-         ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
-     }
--    shake128_ctx_release(&c_state);
-+    shake128_inc_ctx_release(&c_state);
- 
- }
- 
-diff --git a/crypto_kem/kyber768/aarch64/symmetric-shake.c b/crypto_kem/kyber768/aarch64/symmetric-shake.c
-index 72059d4..9311d5d 100644
---- a/crypto_kem/kyber768/aarch64/symmetric-shake.c
-+++ b/crypto_kem/kyber768/aarch64/symmetric-shake.c
-@@ -15,7 +15,7 @@
- *              - uint8_t i: additional byte of input
- *              - uint8_t j: additional byte of input
- **************************************************/
--void kyber_shake128_absorb(shake128ctx *state,
-+void kyber_shake128_absorb(shake128incctx *state,
-                            const uint8_t seed[KYBER_SYMBYTES],
-                            uint8_t x,
-                            uint8_t y) {
-@@ -25,7 +25,7 @@ void kyber_shake128_absorb(shake128ctx *state,
-     extseed[KYBER_SYMBYTES + 0] = x;
-     extseed[KYBER_SYMBYTES + 1] = y;
- 
--    shake128_absorb(state, extseed, sizeof(extseed));
-+    shake128_absorb_once(state, extseed, sizeof(extseed));
- }
- 
- /*************************************************
-diff --git a/crypto_kem/kyber768/aarch64/symmetric.h b/crypto_kem/kyber768/aarch64/symmetric.h
-index 471995c..7da2246 100644
---- a/crypto_kem/kyber768/aarch64/symmetric.h
-+++ b/crypto_kem/kyber768/aarch64/symmetric.h
-@@ -8,12 +8,12 @@
- 
- #include "fips202.h"
- 
--typedef shake128ctx xof_state;
-+typedef shake128incctx xof_state;
- 
- 
- 
- #define kyber_shake128_absorb KYBER_NAMESPACE(kyber_shake128_absorb)
--void kyber_shake128_absorb(shake128ctx *s,
-+void kyber_shake128_absorb(shake128incctx *s,
-                            const uint8_t seed[KYBER_SYMBYTES],
-                            uint8_t x,
-                            uint8_t y);
-@@ -25,6 +25,7 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM
- 
- #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
- #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
-+#define xof_init(STATE) shake128_inc_init(STATE)
- #define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
- #define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
- #define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
--- 
-2.25.1
-
diff --git a/scripts/copy_from_upstream/requirements.txt b/scripts/copy_from_upstream/requirements.txt
index 282f3df..df71bb5 100644
--- a/scripts/copy_from_upstream/requirements.txt
+++ b/scripts/copy_from_upstream/requirements.txt
@@ -5,7 +5,7 @@
 markdown-it-py==2.2.0
 MarkupSafe==1.1.1
 mdit-py-plugins==0.3.4
-PyYAML==5.4.1
+PyYAML==6.0.1
 tabulate==0.8.10
 typing-extensions==3.7.4.3
 wget==3.2
diff --git a/src/common/aes/aes256_armv8.c b/src/common/aes/aes256_armv8.c
index 0a9f254..24dbe0b 100644
--- a/src/common/aes/aes256_armv8.c
+++ b/src/common/aes/aes256_armv8.c
@@ -70,7 +70,7 @@
 }
 
 void oqs_aes256_enc_sch_block_armv8(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) {
-	const unsigned char *schedule = (const unsigned char *) ((aes256ctx_nobitslice *) _schedule)->sk_exp;
+	const unsigned char *schedule = (const unsigned char *) ((const aes256ctx_nobitslice *) _schedule)->sk_exp;
 	aes256_armv8_encrypt(schedule, plaintext, ciphertext);
 }
 
diff --git a/src/kem/kyber/CMakeLists.txt b/src/kem/kyber/CMakeLists.txt
index 0620b5b..5498547 100644
--- a/src/kem/kyber/CMakeLists.txt
+++ b/src/kem/kyber/CMakeLists.txt
@@ -24,7 +24,7 @@
 endif()
 
 if(OQS_ENABLE_KEM_kyber_512_aarch64)
-    add_library(kyber_512_aarch64 OBJECT pqclean_kyber512_aarch64/__asm_base_mul.S pqclean_kyber512_aarch64/__asm_iNTT.S pqclean_kyber512_aarch64/__asm_NTT.S pqclean_kyber512_aarch64/__asm_poly.S pqclean_kyber512_aarch64/cbd.c pqclean_kyber512_aarch64/fips202x2.c pqclean_kyber512_aarch64/indcpa.c pqclean_kyber512_aarch64/kem.c pqclean_kyber512_aarch64/neon_poly.c pqclean_kyber512_aarch64/neon_polyvec.c pqclean_kyber512_aarch64/neon_symmetric-shake.c pqclean_kyber512_aarch64/ntt.c pqclean_kyber512_aarch64/poly.c pqclean_kyber512_aarch64/polyvec.c pqclean_kyber512_aarch64/reduce.c pqclean_kyber512_aarch64/rejsample.c pqclean_kyber512_aarch64/symmetric-shake.c pqclean_kyber512_aarch64/verify.c)
+    add_library(kyber_512_aarch64 OBJECT pqclean_kyber512_aarch64/__asm_base_mul.S pqclean_kyber512_aarch64/__asm_iNTT.S pqclean_kyber512_aarch64/__asm_NTT.S pqclean_kyber512_aarch64/__asm_poly.S pqclean_kyber512_aarch64/cbd.c pqclean_kyber512_aarch64/feat.S pqclean_kyber512_aarch64/fips202x2.c pqclean_kyber512_aarch64/indcpa.c pqclean_kyber512_aarch64/kem.c pqclean_kyber512_aarch64/neon_poly.c pqclean_kyber512_aarch64/neon_polyvec.c pqclean_kyber512_aarch64/neon_symmetric-shake.c pqclean_kyber512_aarch64/ntt.c pqclean_kyber512_aarch64/poly.c pqclean_kyber512_aarch64/polyvec.c pqclean_kyber512_aarch64/reduce.c pqclean_kyber512_aarch64/rejsample.c pqclean_kyber512_aarch64/symmetric-shake.c pqclean_kyber512_aarch64/verify.c)
     target_include_directories(kyber_512_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_kyber512_aarch64)
     target_include_directories(kyber_512_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     if (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
@@ -52,7 +52,7 @@
 endif()
 
 if(OQS_ENABLE_KEM_kyber_768_aarch64)
-    add_library(kyber_768_aarch64 OBJECT pqclean_kyber768_aarch64/__asm_base_mul.S pqclean_kyber768_aarch64/__asm_iNTT.S pqclean_kyber768_aarch64/__asm_NTT.S pqclean_kyber768_aarch64/__asm_poly.S pqclean_kyber768_aarch64/cbd.c pqclean_kyber768_aarch64/fips202x2.c pqclean_kyber768_aarch64/indcpa.c pqclean_kyber768_aarch64/kem.c pqclean_kyber768_aarch64/neon_poly.c pqclean_kyber768_aarch64/neon_polyvec.c pqclean_kyber768_aarch64/neon_symmetric-shake.c pqclean_kyber768_aarch64/ntt.c pqclean_kyber768_aarch64/poly.c pqclean_kyber768_aarch64/polyvec.c pqclean_kyber768_aarch64/reduce.c pqclean_kyber768_aarch64/rejsample.c pqclean_kyber768_aarch64/symmetric-shake.c pqclean_kyber768_aarch64/verify.c)
+    add_library(kyber_768_aarch64 OBJECT pqclean_kyber768_aarch64/__asm_base_mul.S pqclean_kyber768_aarch64/__asm_iNTT.S pqclean_kyber768_aarch64/__asm_NTT.S pqclean_kyber768_aarch64/__asm_poly.S pqclean_kyber768_aarch64/cbd.c pqclean_kyber768_aarch64/feat.S pqclean_kyber768_aarch64/fips202x2.c pqclean_kyber768_aarch64/indcpa.c pqclean_kyber768_aarch64/kem.c pqclean_kyber768_aarch64/neon_poly.c pqclean_kyber768_aarch64/neon_polyvec.c pqclean_kyber768_aarch64/neon_symmetric-shake.c pqclean_kyber768_aarch64/ntt.c pqclean_kyber768_aarch64/poly.c pqclean_kyber768_aarch64/polyvec.c pqclean_kyber768_aarch64/reduce.c pqclean_kyber768_aarch64/rejsample.c pqclean_kyber768_aarch64/symmetric-shake.c pqclean_kyber768_aarch64/verify.c)
     target_include_directories(kyber_768_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_kyber768_aarch64)
     target_include_directories(kyber_768_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     if (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
@@ -80,7 +80,7 @@
 endif()
 
 if(OQS_ENABLE_KEM_kyber_1024_aarch64)
-    add_library(kyber_1024_aarch64 OBJECT pqclean_kyber1024_aarch64/__asm_base_mul.S pqclean_kyber1024_aarch64/__asm_iNTT.S pqclean_kyber1024_aarch64/__asm_NTT.S pqclean_kyber1024_aarch64/__asm_poly.S pqclean_kyber1024_aarch64/cbd.c pqclean_kyber1024_aarch64/fips202x2.c pqclean_kyber1024_aarch64/indcpa.c pqclean_kyber1024_aarch64/kem.c pqclean_kyber1024_aarch64/neon_poly.c pqclean_kyber1024_aarch64/neon_polyvec.c pqclean_kyber1024_aarch64/neon_symmetric-shake.c pqclean_kyber1024_aarch64/ntt.c pqclean_kyber1024_aarch64/poly.c pqclean_kyber1024_aarch64/polyvec.c pqclean_kyber1024_aarch64/reduce.c pqclean_kyber1024_aarch64/rejsample.c pqclean_kyber1024_aarch64/symmetric-shake.c pqclean_kyber1024_aarch64/verify.c)
+    add_library(kyber_1024_aarch64 OBJECT pqclean_kyber1024_aarch64/__asm_base_mul.S pqclean_kyber1024_aarch64/__asm_iNTT.S pqclean_kyber1024_aarch64/__asm_NTT.S pqclean_kyber1024_aarch64/__asm_poly.S pqclean_kyber1024_aarch64/cbd.c pqclean_kyber1024_aarch64/feat.S pqclean_kyber1024_aarch64/fips202x2.c pqclean_kyber1024_aarch64/indcpa.c pqclean_kyber1024_aarch64/kem.c pqclean_kyber1024_aarch64/neon_poly.c pqclean_kyber1024_aarch64/neon_polyvec.c pqclean_kyber1024_aarch64/neon_symmetric-shake.c pqclean_kyber1024_aarch64/ntt.c pqclean_kyber1024_aarch64/poly.c pqclean_kyber1024_aarch64/polyvec.c pqclean_kyber1024_aarch64/reduce.c pqclean_kyber1024_aarch64/rejsample.c pqclean_kyber1024_aarch64/symmetric-shake.c pqclean_kyber1024_aarch64/verify.c)
     target_include_directories(kyber_1024_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_kyber1024_aarch64)
     target_include_directories(kyber_1024_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     if (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/Makefile b/src/kem/kyber/pqclean_kyber1024_aarch64/Makefile
deleted file mode 100644
index c6a98d3..0000000
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-# This Makefile can be used with GNU Make or BSD Make
-
-LIB=libkyber1024_aarch64.a
-HEADERS=api.h cbd.h fips202x2.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h 
-OBJECTS=cbd.o fips202x2.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o 
-
-CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)
-
-all: $(LIB)
-
-%.o: %.c $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-%.o: %.S $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-$(LIB): $(OBJECTS)
-	$(AR) -r $@ $(OBJECTS)
-
-clean:
-	$(RM) $(OBJECTS)
-	$(RM) $(LIB)
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/NTT_params.h b/src/kem/kyber/pqclean_kyber1024_aarch64/NTT_params.h
index 49edeb9..77dae1f 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/NTT_params.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/NTT_params.h
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_PARAMS_H
 #define NTT_PARAMS_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_NTT.S b/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_NTT.S
index bf693c5..0469fcd 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_NTT.S
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_NTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top
-.global _PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top, %function
-#endif
-PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top:
-_PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top:
+.global PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top
+.global _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top
+PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top:
+_PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top:
 
     push_all
     Q         .req w20
@@ -171,13 +194,10 @@
 
 
 .align 2
-.global PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot
-.global _PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot, %function
-#endif
-PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot:
-_PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot:
+.global PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot
+.global _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot
+PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot:
+_PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot:
 
     push_all
     Q         .req w20
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_base_mul.S b/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_base_mul.S
index 3eed305..1b7aed0 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_base_mul.S
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_base_mul.S
@@ -1,16 +1,39 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 #include "params.h"
 
 .align 2
-.global PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended
-.global _PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended
-#if !defined(__clang__) && !defined(old_gas_syntax)
-  .type PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended, %function
-#endif
-PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended:
-_PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended:
+.global PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended
+.global _PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended
+PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended:
+_PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended:
 
     push_all
     Q         .req w20
@@ -71,13 +94,10 @@
 
 
 .align 2
-.global PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul
-.global _PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul, %function
-#endif
-PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul:
-_PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul:
+.global PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul
+.global _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul
+PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul:
+_PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul:
 
     push_all
     Q         .req w28
@@ -226,13 +246,10 @@
 
 
 .align 2
-.global PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery
-.global _PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery, %function
-#endif
-PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery:
-_PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery:
+.global PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery
+.global _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery
+PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery:
+_PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery:
 
     push_all
     Q         .req w28
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_iNTT.S b/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_iNTT.S
index f7e83ab..930b519 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_iNTT.S
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_iNTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot
-.global _PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot, %function
-#endif
-PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot:
-_PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot:
+.global PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot
+.global _PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot
+PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot:
+_PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot:
 
     push_all
     Q         .req w20
@@ -88,13 +111,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top
-.global _PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top, %function
-#endif
-PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top:
-_PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top:
+.global PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top
+.global _PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top
+PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top:
+_PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top:
 
     push_all
     Q         .req w20
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_poly.S b/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_poly.S
index 34a38bb..a5e3e7c 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_poly.S
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/__asm_poly.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_KYBER1024_AARCH64_asm_add_reduce
-.global _PQCLEAN_KYBER1024_AARCH64_asm_add_reduce
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER1024_AARCH64_asm_add_reduce, %function
-#endif
-PQCLEAN_KYBER1024_AARCH64_asm_add_reduce:
-_PQCLEAN_KYBER1024_AARCH64_asm_add_reduce:
+.global PQCLEAN_KYBER1024_AARCH64__asm_add_reduce
+.global _PQCLEAN_KYBER1024_AARCH64__asm_add_reduce
+PQCLEAN_KYBER1024_AARCH64__asm_add_reduce:
+_PQCLEAN_KYBER1024_AARCH64__asm_add_reduce:
 
     mov w4, #3329
     mov w5, #25519
@@ -66,13 +89,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce
-.global _PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce, %function
-#endif
-PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce:
-_PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce:
+.global PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce
+.global _PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce
+PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce:
+_PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce:
 
     mov w4, #3329
     mov w5, #25519
@@ -130,13 +150,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce
-.global _PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce, %function
-#endif
-PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce:
-_PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce:
+.global PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce
+.global _PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce
+PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce:
+_PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce:
 
     mov w4, #3329
     mov w5, #25519
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/api.h b/src/kem/kyber/pqclean_kyber1024_aarch64/api.h
index c9008c2..b6db980 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/api.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/api.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
+
 #ifndef PQCLEAN_KYBER1024_AARCH64_API_H
 #define PQCLEAN_KYBER1024_AARCH64_API_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/cbd.c b/src/kem/kyber/pqclean_kyber1024_aarch64/cbd.c
index f6d9bf3..a1c98b6 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/cbd.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/cbd.c
@@ -1,7 +1,15 @@
-#include "cbd.h"
-#include "params.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #include <arm_neon.h>
 #include <stdint.h>
+#include "params.h"
+#include "cbd.h"
 
 #define vload2(c, ptr) c = vld2q_u8(ptr);
 
@@ -23,7 +31,8 @@
 #define vsublh8(c, a, b) c = (int16x8_t)vsubl_high_u8(a, b);
 
 static
-void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) {
+void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4])
+{
     uint8x16x2_t t, d;      // 4
     uint8x16x2_t a, b;      // 4
     int16x8x4_t res1, res2; // 4
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/cbd.h b/src/kem/kyber/pqclean_kyber1024_aarch64/cbd.h
index e1d2fb5..8a1cee5 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/cbd.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/cbd.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef CBD_H
 #define CBD_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/feat.S b/src/kem/kyber/pqclean_kyber1024_aarch64/feat.S
new file mode 100644
index 0000000..d7dda5b
--- /dev/null
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/feat.S
@@ -0,0 +1,168 @@
+
+/*
+MIT License
+
+Copyright (c) 2020 Bas Westerbaan
+Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3)
+// Standard A64 operand syntax and // comments are used so that GNU as and the LLVM assembler both accept this file.
+.macro round
+    // Macro: one Keccak-f[1600] round on two states (lanes in v0-v24, scratch v25-v31, x1 -> round constant).
+    // Theta, part 1: parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i], not yet xored into the state.
+    eor3 v25.16b, v0.16b, v5.16b, v10.16b
+    eor3 v26.16b, v1.16b, v6.16b, v11.16b
+    eor3 v27.16b, v2.16b, v7.16b, v12.16b
+    eor3 v28.16b, v3.16b, v8.16b, v13.16b
+    eor3 v29.16b, v4.16b, v9.16b, v14.16b
+
+    eor3 v25.16b, v25.16b, v15.16b, v20.16b
+    eor3 v26.16b, v26.16b, v16.16b, v21.16b
+    eor3 v27.16b, v27.16b, v17.16b, v22.16b
+    eor3 v28.16b, v28.16b, v18.16b, v23.16b
+    eor3 v29.16b, v29.16b, v19.16b, v24.16b
+
+    rax1 v30.2d, v29.2d, v26.2d // d[0] = rotl(p[1], 1) ^ p[4]
+    rax1 v29.2d, v27.2d, v29.2d // d[3] = rotl(p[4], 1) ^ p[2]
+    rax1 v27.2d, v25.2d, v27.2d // d[1] = rotl(p[2], 1) ^ p[0]
+    rax1 v25.2d, v28.2d, v25.2d // d[4] = rotl(p[0], 1) ^ p[3]
+    rax1 v28.2d, v26.2d, v28.2d // d[2] = rotl(p[3], 1) ^ p[1]
+
+    // Xor parities from step theta into the state at the same time
+    // as executing rho and pi (xar = xor, then rotate right by the immediate).
+    eor v0.16b, v0.16b, v30.16b
+    mov v31.16b, v1.16b // keep the old lane 1: v1 is overwritten next but consumed last
+    xar v1.2d,  v6.2d,  v27.2d, #20
+    xar v6.2d,  v9.2d,  v25.2d, #44
+    xar v9.2d,  v22.2d, v28.2d, #3
+    xar v22.2d, v14.2d, v25.2d, #25
+    xar v14.2d, v20.2d, v30.2d, #46
+    xar v20.2d, v2.2d,  v28.2d, #2
+    xar v2.2d,  v12.2d, v28.2d, #21
+    xar v12.2d, v13.2d, v29.2d, #39
+    xar v13.2d, v19.2d, v25.2d, #56
+    xar v19.2d, v23.2d, v29.2d, #8
+    xar v23.2d, v15.2d, v30.2d, #23
+    xar v15.2d, v4.2d,  v25.2d, #37
+    xar v4.2d,  v24.2d, v25.2d, #50
+    xar v24.2d, v21.2d, v27.2d, #62
+    xar v21.2d, v8.2d,  v29.2d, #9
+    xar v8.2d,  v16.2d, v27.2d, #19
+    xar v16.2d, v5.2d,  v30.2d, #28
+    xar v5.2d,  v3.2d,  v29.2d, #36
+    xar v3.2d,  v18.2d, v29.2d, #43
+    xar v18.2d, v17.2d, v28.2d, #49
+    xar v17.2d, v11.2d, v27.2d, #54
+    xar v11.2d, v7.2d,  v28.2d, #58
+    xar v7.2d,  v10.2d, v30.2d, #61
+    xar v10.2d, v31.2d, v27.2d, #63
+
+    // Chi: per row of five lanes, a[i] ^= ~a[i+1] & a[i+2]; v25/v26 buffer the first two results.
+    bcax v25.16b, v0.16b,  v2.16b,  v1.16b
+    bcax v26.16b, v1.16b,  v3.16b,  v2.16b
+    bcax v2.16b,  v2.16b,  v4.16b,  v3.16b
+    bcax v3.16b,  v3.16b,  v0.16b,  v4.16b
+    bcax v4.16b,  v4.16b,  v1.16b,  v0.16b
+    mov v0.16b, v25.16b
+    mov v1.16b, v26.16b
+
+    bcax v25.16b, v5.16b,  v7.16b,  v6.16b
+    bcax v26.16b, v6.16b,  v8.16b,  v7.16b
+    bcax v7.16b,  v7.16b,  v9.16b,  v8.16b
+    bcax v8.16b,  v8.16b,  v5.16b,  v9.16b
+    bcax v9.16b,  v9.16b,  v6.16b,  v5.16b
+    mov v5.16b, v25.16b
+    mov v6.16b, v26.16b
+
+    bcax v25.16b, v10.16b,  v12.16b,  v11.16b
+    bcax v26.16b, v11.16b,  v13.16b,  v12.16b
+    bcax v12.16b, v12.16b,  v14.16b,  v13.16b
+    bcax v13.16b, v13.16b,  v10.16b,  v14.16b
+    bcax v14.16b, v14.16b,  v11.16b,  v10.16b
+    mov v10.16b, v25.16b
+    mov v11.16b, v26.16b
+
+    bcax v25.16b, v15.16b,  v17.16b,  v16.16b
+    bcax v26.16b, v16.16b,  v18.16b,  v17.16b
+    bcax v17.16b, v17.16b,  v19.16b,  v18.16b
+    bcax v18.16b, v18.16b,  v15.16b,  v19.16b
+    bcax v19.16b, v19.16b,  v16.16b,  v15.16b
+    mov v15.16b, v25.16b
+    mov v16.16b, v26.16b
+
+    bcax v25.16b, v20.16b,  v22.16b,  v21.16b
+    bcax v26.16b, v21.16b,  v23.16b,  v22.16b
+    bcax v22.16b, v22.16b,  v24.16b,  v23.16b
+    bcax v23.16b, v23.16b,  v20.16b,  v24.16b
+    bcax v24.16b, v24.16b,  v21.16b,  v20.16b
+    mov v20.16b, v25.16b
+    mov v21.16b, v26.16b
+
+    // Iota: load the next 64-bit round constant into both halves and advance x1.
+    ld1r {v25.2d}, [x1], #8
+    eor v0.16b, v0.16b, v25.16b
+.endm
+// f1600x2(x0 = 25 x 128-bit state, x1 = 64-bit round constants): 24 Keccak-f[1600] rounds on two states, in place.
+.align 4
+.global PQCLEAN_KYBER1024_AARCH64_f1600x2
+.global _PQCLEAN_KYBER1024_AARCH64_f1600x2
+PQCLEAN_KYBER1024_AARCH64_f1600x2:
+_PQCLEAN_KYBER1024_AARCH64_f1600x2:
+    stp d8,  d9,  [sp,#-16]!
+    stp d10, d11, [sp,#-16]!
+    stp d12, d13, [sp,#-16]!
+    stp d14, d15, [sp,#-16]!
+    // d8-d15 saved above: v8-v15 are used below and their low halves are callee-saved under AAPCS64.
+    mov x2, x0 // remember the state base; x0 is post-incremented by the loads
+    mov x3, #24 // round counter
+
+    ld1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    ld1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    ld1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    ld1 {v24.2d}, [x0]
+
+1:
+    round
+
+    subs x3, x3, #1
+    cbnz x3, 1b
+
+    mov x0, x2 // rewind to the start of the state for the write-back
+    st1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    st1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    st1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    st1 {v24.2d}, [x0]
+
+    ldp d14, d15, [sp], #16
+    ldp d12, d13, [sp], #16
+    ldp d10, d11, [sp], #16
+    ldp d8,  d9,  [sp], #16
+
+    ret
+
+#endif
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/fips202x2.c b/src/kem/kyber/pqclean_kyber1024_aarch64/fips202x2.c
index 3924900..a50a580 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/fips202x2.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/fips202x2.c
@@ -1,6 +1,40 @@
-#include "fips202x2.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <arm_neon.h>
 #include <stddef.h>
+#include "fips202x2.h"
 
 
 #define NROUNDS 24
@@ -67,275 +101,282 @@
 *
 * Arguments:   - uint64_t *state: pointer to input/output Keccak state
 **************************************************/
+extern void PQCLEAN_KYBER1024_AARCH64_f1600x2(v128*, const uint64_t*);
 static inline
-void KeccakF1600_StatePermutex2(v128 state[25]) {
-    v128 Aba, Abe, Abi, Abo, Abu;
-    v128 Aga, Age, Agi, Ago, Agu;
-    v128 Aka, Ake, Aki, Ako, Aku;
-    v128 Ama, Ame, Ami, Amo, Amu;
-    v128 Asa, Ase, Asi, Aso, Asu;
-    v128 BCa, BCe, BCi, BCo, BCu; // tmp
-    v128 Da, De, Di, Do, Du;      // D
-    v128 Eba, Ebe, Ebi, Ebo, Ebu;
-    v128 Ega, Ege, Egi, Ego, Egu;
-    v128 Eka, Eke, Eki, Eko, Eku;
-    v128 Ema, Eme, Emi, Emo, Emu;
-    v128 Esa, Ese, Esi, Eso, Esu;
+void KeccakF1600_StatePermutex2(v128 state[25])
+{
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* use the SHA-3 extension assembly (EOR3/RAX1/XAR/BCAX) from feat.S when available */
+  PQCLEAN_KYBER1024_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
+#else
+  v128 Aba, Abe, Abi, Abo, Abu;
+  v128 Aga, Age, Agi, Ago, Agu;
+  v128 Aka, Ake, Aki, Ako, Aku;
+  v128 Ama, Ame, Ami, Amo, Amu;
+  v128 Asa, Ase, Asi, Aso, Asu;
+  v128 BCa, BCe, BCi, BCo, BCu; // tmp
+  v128 Da, De, Di, Do, Du;      // D
+  v128 Eba, Ebe, Ebi, Ebo, Ebu;
+  v128 Ega, Ege, Egi, Ego, Egu;
+  v128 Eka, Eke, Eki, Eko, Eku;
+  v128 Ema, Eme, Emi, Emo, Emu;
+  v128 Esa, Ese, Esi, Eso, Esu;
 
-    //copyFromState(A, state)
-    Aba = state[0];
-    Abe = state[1];
-    Abi = state[2];
-    Abo = state[3];
-    Abu = state[4];
-    Aga = state[5];
-    Age = state[6];
-    Agi = state[7];
-    Ago = state[8];
-    Agu = state[9];
-    Aka = state[10];
-    Ake = state[11];
-    Aki = state[12];
-    Ako = state[13];
-    Aku = state[14];
-    Ama = state[15];
-    Ame = state[16];
-    Ami = state[17];
-    Amo = state[18];
-    Amu = state[19];
-    Asa = state[20];
-    Ase = state[21];
-    Asi = state[22];
-    Aso = state[23];
-    Asu = state[24];
+  //copyFromState(A, state)
+  Aba = state[0];
+  Abe = state[1];
+  Abi = state[2];
+  Abo = state[3];
+  Abu = state[4];
+  Aga = state[5];
+  Age = state[6];
+  Agi = state[7];
+  Ago = state[8];
+  Agu = state[9];
+  Aka = state[10];
+  Ake = state[11];
+  Aki = state[12];
+  Ako = state[13];
+  Aku = state[14];
+  Ama = state[15];
+  Ame = state[16];
+  Ami = state[17];
+  Amo = state[18];
+  Amu = state[19];
+  Asa = state[20];
+  Ase = state[21];
+  Asi = state[22];
+  Aso = state[23];
+  Asu = state[24];
 
-    for (int round = 0; round < NROUNDS; round += 2) {
-        //    prepareTheta
-        vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
-        vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
-        vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
-        vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
-        vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
+  for (int round = 0; round < NROUNDS; round += 2)
+  {
+    //    prepareTheta
+    vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
+    vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
+    vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
+    vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
+    vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Aba, Aba, Da);
-        vxor(Age, Age, De);
-        vROL(BCe, Age, 44);
-        vxor(Aki, Aki, Di);
-        vROL(BCi, Aki, 43);
-        vxor(Amo, Amo, Do);
-        vROL(BCo, Amo, 21);
-        vxor(Asu, Asu, Du);
-        vROL(BCu, Asu, 14);
-        vXNA(Eba, Aba, BCe, BCi);
-        vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
-        vXNA(Ebe, BCe, BCi, BCo);
-        vXNA(Ebi, BCi, BCo, BCu);
-        vXNA(Ebo, BCo, BCu, Aba);
-        vXNA(Ebu, BCu, Aba, BCe);
+    vxor(Aba, Aba, Da);
+    vxor(Age, Age, De);
+    vROL(BCe, Age, 44);
+    vxor(Aki, Aki, Di);
+    vROL(BCi, Aki, 43);
+    vxor(Amo, Amo, Do);
+    vROL(BCo, Amo, 21);
+    vxor(Asu, Asu, Du);
+    vROL(BCu, Asu, 14);
+    vXNA(Eba, Aba, BCe, BCi);
+    vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
+    vXNA(Ebe, BCe, BCi, BCo);
+    vXNA(Ebi, BCi, BCo, BCu);
+    vXNA(Ebo, BCo, BCu, Aba);
+    vXNA(Ebu, BCu, Aba, BCe);
 
-        vxor(Abo, Abo, Do);
-        vROL(BCa, Abo, 28);
-        vxor(Agu, Agu, Du);
-        vROL(BCe, Agu, 20);
-        vxor(Aka, Aka, Da);
-        vROL(BCi, Aka, 3);
-        vxor(Ame, Ame, De);
-        vROL(BCo, Ame, 45);
-        vxor(Asi, Asi, Di);
-        vROL(BCu, Asi, 61);
-        vXNA(Ega, BCa, BCe, BCi);
-        vXNA(Ege, BCe, BCi, BCo);
-        vXNA(Egi, BCi, BCo, BCu);
-        vXNA(Ego, BCo, BCu, BCa);
-        vXNA(Egu, BCu, BCa, BCe);
+    vxor(Abo, Abo, Do);
+    vROL(BCa, Abo, 28);
+    vxor(Agu, Agu, Du);
+    vROL(BCe, Agu, 20);
+    vxor(Aka, Aka, Da);
+    vROL(BCi, Aka, 3);
+    vxor(Ame, Ame, De);
+    vROL(BCo, Ame, 45);
+    vxor(Asi, Asi, Di);
+    vROL(BCu, Asi, 61);
+    vXNA(Ega, BCa, BCe, BCi);
+    vXNA(Ege, BCe, BCi, BCo);
+    vXNA(Egi, BCi, BCo, BCu);
+    vXNA(Ego, BCo, BCu, BCa);
+    vXNA(Egu, BCu, BCa, BCe);
 
-        vxor(Abe, Abe, De);
-        vROL(BCa, Abe, 1);
-        vxor(Agi, Agi, Di);
-        vROL(BCe, Agi, 6);
-        vxor(Ako, Ako, Do);
-        vROL(BCi, Ako, 25);
-        vxor(Amu, Amu, Du);
-        vROL(BCo, Amu, 8);
-        vxor(Asa, Asa, Da);
-        vROL(BCu, Asa, 18);
-        vXNA(Eka, BCa, BCe, BCi);
-        vXNA(Eke, BCe, BCi, BCo);
-        vXNA(Eki, BCi, BCo, BCu);
-        vXNA(Eko, BCo, BCu, BCa);
-        vXNA(Eku, BCu, BCa, BCe);
+    vxor(Abe, Abe, De);
+    vROL(BCa, Abe, 1);
+    vxor(Agi, Agi, Di);
+    vROL(BCe, Agi, 6);
+    vxor(Ako, Ako, Do);
+    vROL(BCi, Ako, 25);
+    vxor(Amu, Amu, Du);
+    vROL(BCo, Amu, 8);
+    vxor(Asa, Asa, Da);
+    vROL(BCu, Asa, 18);
+    vXNA(Eka, BCa, BCe, BCi);
+    vXNA(Eke, BCe, BCi, BCo);
+    vXNA(Eki, BCi, BCo, BCu);
+    vXNA(Eko, BCo, BCu, BCa);
+    vXNA(Eku, BCu, BCa, BCe);
 
-        vxor(Abu, Abu, Du);
-        vROL(BCa, Abu, 27);
-        vxor(Aga, Aga, Da);
-        vROL(BCe, Aga, 36);
-        vxor(Ake, Ake, De);
-        vROL(BCi, Ake, 10);
-        vxor(Ami, Ami, Di);
-        vROL(BCo, Ami, 15);
-        vxor(Aso, Aso, Do);
-        vROL(BCu, Aso, 56);
-        vXNA(Ema, BCa, BCe, BCi);
-        vXNA(Eme, BCe, BCi, BCo);
-        vXNA(Emi, BCi, BCo, BCu);
-        vXNA(Emo, BCo, BCu, BCa);
-        vXNA(Emu, BCu, BCa, BCe);
+    vxor(Abu, Abu, Du);
+    vROL(BCa, Abu, 27);
+    vxor(Aga, Aga, Da);
+    vROL(BCe, Aga, 36);
+    vxor(Ake, Ake, De);
+    vROL(BCi, Ake, 10);
+    vxor(Ami, Ami, Di);
+    vROL(BCo, Ami, 15);
+    vxor(Aso, Aso, Do);
+    vROL(BCu, Aso, 56);
+    vXNA(Ema, BCa, BCe, BCi);
+    vXNA(Eme, BCe, BCi, BCo);
+    vXNA(Emi, BCi, BCo, BCu);
+    vXNA(Emo, BCo, BCu, BCa);
+    vXNA(Emu, BCu, BCa, BCe);
 
-        vxor(Abi, Abi, Di);
-        vROL(BCa, Abi, 62);
-        vxor(Ago, Ago, Do);
-        vROL(BCe, Ago, 55);
-        vxor(Aku, Aku, Du);
-        vROL(BCi, Aku, 39);
-        vxor(Ama, Ama, Da);
-        vROL(BCo, Ama, 41);
-        vxor(Ase, Ase, De);
-        vROL(BCu, Ase, 2);
-        vXNA(Esa, BCa, BCe, BCi);
-        vXNA(Ese, BCe, BCi, BCo);
-        vXNA(Esi, BCi, BCo, BCu);
-        vXNA(Eso, BCo, BCu, BCa);
-        vXNA(Esu, BCu, BCa, BCe);
+    vxor(Abi, Abi, Di);
+    vROL(BCa, Abi, 62);
+    vxor(Ago, Ago, Do);
+    vROL(BCe, Ago, 55);
+    vxor(Aku, Aku, Du);
+    vROL(BCi, Aku, 39);
+    vxor(Ama, Ama, Da);
+    vROL(BCo, Ama, 41);
+    vxor(Ase, Ase, De);
+    vROL(BCu, Ase, 2);
+    vXNA(Esa, BCa, BCe, BCi);
+    vXNA(Ese, BCe, BCi, BCo);
+    vXNA(Esi, BCi, BCo, BCu);
+    vXNA(Eso, BCo, BCu, BCa);
+    vXNA(Esu, BCu, BCa, BCe);
 
-        // Next Round
+    // Next Round
 
-        //    prepareTheta
-        vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
-        vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
-        vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
-        vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
-        vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
+    //    prepareTheta
+    vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
+    vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
+    vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
+    vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
+    vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Eba, Eba, Da);
-        vxor(Ege, Ege, De);
-        vROL(BCe, Ege, 44);
-        vxor(Eki, Eki, Di);
-        vROL(BCi, Eki, 43);
-        vxor(Emo, Emo, Do);
-        vROL(BCo, Emo, 21);
-        vxor(Esu, Esu, Du);
-        vROL(BCu, Esu, 14);
-        vXNA(Aba, Eba, BCe, BCi);
-        vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
-        vXNA(Abe, BCe, BCi, BCo);
-        vXNA(Abi, BCi, BCo, BCu);
-        vXNA(Abo, BCo, BCu, Eba);
-        vXNA(Abu, BCu, Eba, BCe);
+    vxor(Eba, Eba, Da);
+    vxor(Ege, Ege, De);
+    vROL(BCe, Ege, 44);
+    vxor(Eki, Eki, Di);
+    vROL(BCi, Eki, 43);
+    vxor(Emo, Emo, Do);
+    vROL(BCo, Emo, 21);
+    vxor(Esu, Esu, Du);
+    vROL(BCu, Esu, 14);
+    vXNA(Aba, Eba, BCe, BCi);
+    vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
+    vXNA(Abe, BCe, BCi, BCo);
+    vXNA(Abi, BCi, BCo, BCu);
+    vXNA(Abo, BCo, BCu, Eba);
+    vXNA(Abu, BCu, Eba, BCe);
 
-        vxor(Ebo, Ebo, Do);
-        vROL(BCa, Ebo, 28);
-        vxor(Egu, Egu, Du);
-        vROL(BCe, Egu, 20);
-        vxor(Eka, Eka, Da);
-        vROL(BCi, Eka, 3);
-        vxor(Eme, Eme, De);
-        vROL(BCo, Eme, 45);
-        vxor(Esi, Esi, Di);
-        vROL(BCu, Esi, 61);
-        vXNA(Aga, BCa, BCe, BCi);
-        vXNA(Age, BCe, BCi, BCo);
-        vXNA(Agi, BCi, BCo, BCu);
-        vXNA(Ago, BCo, BCu, BCa);
-        vXNA(Agu, BCu, BCa, BCe);
+    vxor(Ebo, Ebo, Do);
+    vROL(BCa, Ebo, 28);
+    vxor(Egu, Egu, Du);
+    vROL(BCe, Egu, 20);
+    vxor(Eka, Eka, Da);
+    vROL(BCi, Eka, 3);
+    vxor(Eme, Eme, De);
+    vROL(BCo, Eme, 45);
+    vxor(Esi, Esi, Di);
+    vROL(BCu, Esi, 61);
+    vXNA(Aga, BCa, BCe, BCi);
+    vXNA(Age, BCe, BCi, BCo);
+    vXNA(Agi, BCi, BCo, BCu);
+    vXNA(Ago, BCo, BCu, BCa);
+    vXNA(Agu, BCu, BCa, BCe);
 
-        vxor(Ebe, Ebe, De);
-        vROL(BCa, Ebe, 1);
-        vxor(Egi, Egi, Di);
-        vROL(BCe, Egi, 6);
-        vxor(Eko, Eko, Do);
-        vROL(BCi, Eko, 25);
-        vxor(Emu, Emu, Du);
-        vROL(BCo, Emu, 8);
-        vxor(Esa, Esa, Da);
-        vROL(BCu, Esa, 18);
-        vXNA(Aka, BCa, BCe, BCi);
-        vXNA(Ake, BCe, BCi, BCo);
-        vXNA(Aki, BCi, BCo, BCu);
-        vXNA(Ako, BCo, BCu, BCa);
-        vXNA(Aku, BCu, BCa, BCe);
+    vxor(Ebe, Ebe, De);
+    vROL(BCa, Ebe, 1);
+    vxor(Egi, Egi, Di);
+    vROL(BCe, Egi, 6);
+    vxor(Eko, Eko, Do);
+    vROL(BCi, Eko, 25);
+    vxor(Emu, Emu, Du);
+    vROL(BCo, Emu, 8);
+    vxor(Esa, Esa, Da);
+    vROL(BCu, Esa, 18);
+    vXNA(Aka, BCa, BCe, BCi);
+    vXNA(Ake, BCe, BCi, BCo);
+    vXNA(Aki, BCi, BCo, BCu);
+    vXNA(Ako, BCo, BCu, BCa);
+    vXNA(Aku, BCu, BCa, BCe);
 
-        vxor(Ebu, Ebu, Du);
-        vROL(BCa, Ebu, 27);
-        vxor(Ega, Ega, Da);
-        vROL(BCe, Ega, 36);
-        vxor(Eke, Eke, De);
-        vROL(BCi, Eke, 10);
-        vxor(Emi, Emi, Di);
-        vROL(BCo, Emi, 15);
-        vxor(Eso, Eso, Do);
-        vROL(BCu, Eso, 56);
-        vXNA(Ama, BCa, BCe, BCi);
-        vXNA(Ame, BCe, BCi, BCo);
-        vXNA(Ami, BCi, BCo, BCu);
-        vXNA(Amo, BCo, BCu, BCa);
-        vXNA(Amu, BCu, BCa, BCe);
+    vxor(Ebu, Ebu, Du);
+    vROL(BCa, Ebu, 27);
+    vxor(Ega, Ega, Da);
+    vROL(BCe, Ega, 36);
+    vxor(Eke, Eke, De);
+    vROL(BCi, Eke, 10);
+    vxor(Emi, Emi, Di);
+    vROL(BCo, Emi, 15);
+    vxor(Eso, Eso, Do);
+    vROL(BCu, Eso, 56);
+    vXNA(Ama, BCa, BCe, BCi);
+    vXNA(Ame, BCe, BCi, BCo);
+    vXNA(Ami, BCi, BCo, BCu);
+    vXNA(Amo, BCo, BCu, BCa);
+    vXNA(Amu, BCu, BCa, BCe);
 
-        vxor(Ebi, Ebi, Di);
-        vROL(BCa, Ebi, 62);
-        vxor(Ego, Ego, Do);
-        vROL(BCe, Ego, 55);
-        vxor(Eku, Eku, Du);
-        vROL(BCi, Eku, 39);
-        vxor(Ema, Ema, Da);
-        vROL(BCo, Ema, 41);
-        vxor(Ese, Ese, De);
-        vROL(BCu, Ese, 2);
-        vXNA(Asa, BCa, BCe, BCi);
-        vXNA(Ase, BCe, BCi, BCo);
-        vXNA(Asi, BCi, BCo, BCu);
-        vXNA(Aso, BCo, BCu, BCa);
-        vXNA(Asu, BCu, BCa, BCe);
-    }
+    vxor(Ebi, Ebi, Di);
+    vROL(BCa, Ebi, 62);
+    vxor(Ego, Ego, Do);
+    vROL(BCe, Ego, 55);
+    vxor(Eku, Eku, Du);
+    vROL(BCi, Eku, 39);
+    vxor(Ema, Ema, Da);
+    vROL(BCo, Ema, 41);
+    vxor(Ese, Ese, De);
+    vROL(BCu, Ese, 2);
+    vXNA(Asa, BCa, BCe, BCi);
+    vXNA(Ase, BCe, BCi, BCo);
+    vXNA(Asi, BCi, BCo, BCu);
+    vXNA(Aso, BCo, BCu, BCa);
+    vXNA(Asu, BCu, BCa, BCe);
+  }
 
-    state[0] = Aba;
-    state[1] = Abe;
-    state[2] = Abi;
-    state[3] = Abo;
-    state[4] = Abu;
-    state[5] = Aga;
-    state[6] = Age;
-    state[7] = Agi;
-    state[8] = Ago;
-    state[9] = Agu;
-    state[10] = Aka;
-    state[11] = Ake;
-    state[12] = Aki;
-    state[13] = Ako;
-    state[14] = Aku;
-    state[15] = Ama;
-    state[16] = Ame;
-    state[17] = Ami;
-    state[18] = Amo;
-    state[19] = Amu;
-    state[20] = Asa;
-    state[21] = Ase;
-    state[22] = Asi;
-    state[23] = Aso;
-    state[24] = Asu;
+  state[0] = Aba;
+  state[1] = Abe;
+  state[2] = Abi;
+  state[3] = Abo;
+  state[4] = Abu;
+  state[5] = Aga;
+  state[6] = Age;
+  state[7] = Agi;
+  state[8] = Ago;
+  state[9] = Agu;
+  state[10] = Aka;
+  state[11] = Ake;
+  state[12] = Aki;
+  state[13] = Ako;
+  state[14] = Aku;
+  state[15] = Ama;
+  state[16] = Ame;
+  state[17] = Ami;
+  state[18] = Amo;
+  state[19] = Amu;
+  state[20] = Asa;
+  state[21] = Ase;
+  state[22] = Asi;
+  state[23] = Aso;
+  state[24] = Asu;
+#endif
 }
 
 /*************************************************
@@ -463,39 +504,41 @@
                             uint8_t *out1,
                             size_t nblocks,
                             unsigned int r,
-                            v128 s[25]) {
-    unsigned int i;
+                            v128 s[25]){
+  unsigned int i;
 
-    uint64x1_t a, b;
-    uint64x2x2_t a2, b2;
+  uint64x1_t a, b;
+  uint64x2x2_t a2, b2;
 
-    while (nblocks > 0) {
-        KeccakF1600_StatePermutex2(s);
+  while (nblocks > 0)
+  {
+    KeccakF1600_StatePermutex2(s);
 
-        for (i = 0; i < r / 8 - 1; i += 4) {
-            a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
-            b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
-            a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
-            b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
-            vst1q_u64_x2((uint64_t *)out0, a2);
-            vst1q_u64_x2((uint64_t *)out1, b2);
+    for (i = 0; i < r / 8 - 1; i += 4)
+    {
+      a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
+      b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
+      a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
+      b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
+      vst1q_u64_x2((uint64_t *)out0, a2);
+      vst1q_u64_x2((uint64_t *)out1, b2);
 
-            out0 += 32;
-            out1 += 32;
-        }
-
-        i = r / 8 - 1;
-        // Last iteration
-        a = vget_low_u64(s[i]);
-        b = vget_high_u64(s[i]);
-        vst1_u64((uint64_t *)out0, a);
-        vst1_u64((uint64_t *)out1, b);
-
-        out0 += 8;
-        out1 += 8;
-
-        --nblocks;
+      out0 += 32;
+      out1 += 32;
     }
+
+    i = r / 8 - 1;
+    // Last iteration
+    a = vget_low_u64(s[i]);
+    b = vget_high_u64(s[i]);
+    vst1_u64((uint64_t *)out0, a);
+    vst1_u64((uint64_t *)out1, b);
+
+    out0 += 8;
+    out1 += 8;
+
+    --nblocks;
+  }
 }
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/fips202x2.h b/src/kem/kyber/pqclean_kyber1024_aarch64/fips202x2.h
index 7cffd7b..a1eacdf 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/fips202x2.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/fips202x2.h
@@ -1,10 +1,17 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef FIPS202X2_H
 #define FIPS202X2_H
 
 #include "params.h"
 #include <arm_neon.h>
 #include <stddef.h>
-
 #include "fips202.h"
 
 typedef uint64x2_t v128;
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/indcpa.c b/src/kem/kyber/pqclean_kyber1024_aarch64/indcpa.c
index c2748e5..43f489f 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/indcpa.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/indcpa.c
@@ -1,15 +1,50 @@
-#include "NTT_params.h"
-#include "indcpa.h"
-#include "ntt.h"
-#include "params.h"
-#include "poly.h"
-#include "polyvec.h"
-#include "randombytes.h"
-#include "rejsample.h"
-#include "symmetric.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+#include "params.h"
+#include "rejsample.h"
+#include "indcpa.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "symmetric.h"
+
+#include "NTT_params.h"
+#include "ntt.h"
 
 /*************************************************
 * Name:        pack_pk
@@ -125,40 +160,44 @@
 **************************************************/
 #define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
 // Not static for benchmarking
-void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed) {
-    unsigned int ctr0, ctr1, k;
-    unsigned int buflen, off;
-    uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
-            buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
-    neon_xof_state state;
+void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed) // fills every a[i][j] from XOF output derived from seed and the (i, j) indices
+{
+  unsigned int ctr0, ctr1, k; // ctr0/ctr1: running counts returned by the rej_uniform helpers for a[i][j] and a[i][j+1]
+  unsigned int buflen, off; // buflen: bytes currently valid in buf0/buf1; off: tail bytes carried into the next refill
+  uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2], // buf0 feeds a[i][j], buf1 feeds a[i][j+1]
+      buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
+  neon_xof_state state;
 
-    for (unsigned int i = 0; i < KYBER_K; i++) {
-        for (unsigned int j = 0; j < KYBER_K; j += 2) {
-            if (transposed) {
-                neon_xof_absorb(&state, seed, i, i, j, j + 1);
-            } else {
-                neon_xof_absorb(&state, seed, j, j + 1, i, i);
-            }
+  for (unsigned int i = 0; i < KYBER_K; i++)
+  {
+    for (unsigned int j = 0; j < KYBER_K; j += 2) // two entries, a[i][j] and a[i][j+1], are produced per iteration
+    {
+      if (transposed) // the two branches pass the same indices with the (i,i) and (j,j+1) pairs swapped
+        neon_xof_absorb(&state, seed, i, i, j, j + 1);
+      else
+        neon_xof_absorb(&state, seed, j, j + 1, i, i);
 
-            neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
-            buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
-            ctr0 = neon_rej_uniform(&(a[i][j][0]), buf0);
-            ctr1 = neon_rej_uniform(&(a[i][j + 1][0]), buf1);
+      neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); // initial squeeze of GEN_MATRIX_NBLOCKS blocks
+      buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
+      ctr0 = neon_rej_uniform(&(a[i][j][0]), buf0); // first sampling pass over the initial buffer
+      ctr1 = neon_rej_uniform(&(a[i][j + 1][0]), buf1);
 
-            while (ctr0 < KYBER_N || ctr1 < KYBER_N) {
-                off = buflen % 3;
-                for (k = 0; k < off; k++) {
-                    buf0[k] = buf0[buflen - off + k];
-                    buf1[k] = buf1[buflen - off + k];
-                }
-                neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
-
-                buflen = off + XOF_BLOCKBYTES;
-                ctr0 += rej_uniform(&(a[i][j][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
-                ctr1 += rej_uniform(&(a[i][j + 1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen);
-            }
+      while (ctr0 < KYBER_N || ctr1 < KYBER_N) // refill until both polynomials hold KYBER_N coefficients
+      {
+        off = buflen % 3; // presumably the sampler consumes bytes in groups of 3 - TODO confirm against rej_uniform
+        for (k = 0; k < off; k++) // copy the last off bytes to the front of both buffers
+        {
+          buf0[k] = buf0[buflen - off + k];
+          buf1[k] = buf1[buflen - off + k];
         }
+        neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); // one more block, written right after the carried bytes
+
+        buflen = off + XOF_BLOCKBYTES;
+        ctr0 += rej_uniform(&(a[i][j][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); // only the KYBER_N - ctr0 missing slots are requested
+        ctr1 += rej_uniform(&(a[i][j + 1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen);
+      }
     }
+  }
 }
 
 /*************************************************
@@ -198,11 +237,11 @@
     neon_polyvec_ntt(e);
 
     for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended(&(skpv_asymmetric[i][0]), &(skpv[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
+        PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended(&(skpv_asymmetric[i][0]), &(skpv[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
     }
 
     for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery(&(a[i][0][0]), &(skpv[0][0]), &(skpv_asymmetric[0][0]), asymmetric_const, pkpv[i]);
+        PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery(&(a[i][0][0]), &(skpv[0][0]), &(skpv_asymmetric[0][0]), asymmetric_const, pkpv[i]);
     }
 
     neon_polyvec_add_reduce(pkpv, e);
@@ -256,15 +295,15 @@
 
     neon_polyvec_ntt(sp);
 
-    for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended(&(sp_asymmetric[i][0]), &(sp[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
+    for(i = 0; i < KYBER_K; i++){
+        PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended(&(sp_asymmetric[i][0]), &(sp[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
     }
 
-    for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul(&(at[i][0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, b[i]);
+    for(i = 0; i < KYBER_K; i++){
+        PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul(&(at[i][0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, b[i]);
     }
 
-    PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul(&(pkpv[0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, v);
+    PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul(&(pkpv[0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, v);
 
     neon_polyvec_invntt_to_mont(b);
     invntt(v);
@@ -306,10 +345,10 @@
     neon_polyvec_ntt(b);
 
     for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended(&(b_asymmetric[i][0]), &(b[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
+        PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended(&(b_asymmetric[i][0]), &(b[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
     }
 
-    PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul(&(skpv[0][0]), &(b[0][0]), &(b_asymmetric[0][0]), asymmetric_const, mp);
+    PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul(&(skpv[0][0]), &(b[0][0]), &(b_asymmetric[0][0]), asymmetric_const, mp);
 
     invntt(mp);
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/indcpa.h b/src/kem/kyber/pqclean_kyber1024_aarch64/indcpa.h
index f718f39..b74bc0b 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/indcpa.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/indcpa.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef INDCPA_H
 #define INDCPA_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/kem.c b/src/kem/kyber/pqclean_kyber1024_aarch64/kem.c
index d8d8a0e..c8217f7 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/kem.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/kem.c
@@ -1,11 +1,18 @@
-#include "indcpa.h"
-#include "kem.h"
-#include "params.h"
-#include "randombytes.h"
-#include "symmetric.h"
-#include "verify.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #include <stddef.h>
 #include <stdint.h>
+#include "params.h"
+#include "indcpa.h"
+#include "verify.h"
+#include "symmetric.h"
+#include "randombytes.h"
+#include "kem.h"
 
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/kem.h b/src/kem/kyber/pqclean_kyber1024_aarch64/kem.h
index ba1010d..965cfee 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/kem.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/kem.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef KEM_H
 #define KEM_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/macros.inc b/src/kem/kyber/pqclean_kyber1024_aarch64/macros.inc
index 9e392b0..2add309 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/macros.inc
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/macros.inc
@@ -1,4 +1,30 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef MACROS_S
 #define MACROS_S
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/macros_common.inc b/src/kem/kyber/pqclean_kyber1024_aarch64/macros_common.inc
index 26e7cbb..c1ac021 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/macros_common.inc
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/macros_common.inc
@@ -1,4 +1,30 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef MACROS_COMMON
 #define MACROS_COMMON
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/neon_poly.c b/src/kem/kyber/pqclean_kyber1024_aarch64/neon_poly.c
index 6b6f2ab..6d787dd 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/neon_poly.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/neon_poly.c
@@ -1,9 +1,43 @@
-#include "cbd.h"
-#include "ntt.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <arm_neon.h>
 #include "params.h"
 #include "poly.h"
+#include "ntt.h"
+#include "cbd.h"
 #include "symmetric.h"
-#include <arm_neon.h>
 
 
 /*************************************************
@@ -97,14 +131,14 @@
 *            - const poly *a: pointer to first input polynomial
 *            - const poly *b: pointer to second input polynomial
 **************************************************/
-extern void PQCLEAN_KYBER1024_AARCH64_asm_add_reduce(int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER1024_AARCH64__asm_add_reduce(int16_t *, const int16_t *); // external routine; this revision renames it to use a double underscore before asm
 void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
-    PQCLEAN_KYBER1024_AARCH64_asm_add_reduce(c, a);
+    PQCLEAN_KYBER1024_AARCH64__asm_add_reduce(c, a); // thin wrapper: forwards c and a unchanged to the external routine
 }
 
-extern void PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); // external routine; renamed to the double-underscore symbol
 void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) {
-    PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce(c, a, b);
+    PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce(c, a, b); // thin wrapper: forwards c, a and b unchanged to the external routine
 }
 
 /*************************************************
@@ -118,7 +152,7 @@
 *            - const poly *a: pointer to first input polynomial
 *            - const poly *b: pointer to second input polynomial
 **************************************************/
-extern void PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce(int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce(int16_t *, const int16_t *); // external routine; renamed to the double-underscore symbol
 void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
-    PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce(c, a);
+    PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce(c, a); // thin wrapper: forwards c and a unchanged to the external routine
 }
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/neon_polyvec.c b/src/kem/kyber/pqclean_kyber1024_aarch64/neon_polyvec.c
index 1af48ea..c05f59d 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/neon_polyvec.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/neon_polyvec.c
@@ -1,10 +1,45 @@
-#include "NTT_params.h"
-#include "ntt.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <arm_neon.h>
 #include "params.h"
+#include "reduce.h"
+#include "ntt.h"
 #include "poly.h"
 #include "polyvec.h"
-#include "reduce.h"
-#include <arm_neon.h>
+
+#include "NTT_params.h"
 
 #define _V (((1U << 26) + KYBER_Q / 2) / KYBER_Q)
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/neon_symmetric-shake.c b/src/kem/kyber/pqclean_kyber1024_aarch64/neon_symmetric-shake.c
index 6515250..8aced5e 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/neon_symmetric-shake.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/neon_symmetric-shake.c
@@ -1,8 +1,42 @@
-#include "fips202x2.h"
-#include "params.h"
-#include "symmetric.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <stddef.h>
 #include <stdint.h>
+#include "params.h"
+#include "fips202x2.h"
+#include "symmetric.h"
 
 /*************************************************
 * Name:        kyber_shake128_absorb
@@ -19,22 +53,23 @@
 void neon_kyber_shake128_absorb(keccakx2_state *state,
                                 const uint8_t seed[KYBER_SYMBYTES],
                                 uint8_t x1, uint8_t x2,
-                                uint8_t y1, uint8_t y2) {
-    unsigned int i;
-    uint8_t extseed1[KYBER_SYMBYTES + 2 + 14];
-    uint8_t extseed2[KYBER_SYMBYTES + 2 + 14];
+                                uint8_t y1, uint8_t y2)
+{
+  unsigned int i;
+  uint8_t extseed1[KYBER_SYMBYTES+2]; // seed || x1 || y1. NOTE(review): the replaced revision reserved 14 extra bytes here - confirm shake128x2_absorb never reads beyond the length it is given
+  uint8_t extseed2[KYBER_SYMBYTES+2]; // seed || x2 || y2
 
-    for (i = 0; i < KYBER_SYMBYTES; i++) {
-        extseed1[i] = seed[i];
-        extseed2[i] = seed[i];
-    }
-    extseed1[KYBER_SYMBYTES  ] = x1;
-    extseed1[KYBER_SYMBYTES + 1] = y1;
+  for(i=0;i<KYBER_SYMBYTES;i++){ // both buffers start with the same KYBER_SYMBYTES seed bytes
+    extseed1[i] = seed[i];
+    extseed2[i] = seed[i];
+  }
+  extseed1[KYBER_SYMBYTES  ] = x1; // the two trailing bytes are what distinguishes the first input from the second
+  extseed1[KYBER_SYMBYTES+1] = y1;
 
-    extseed2[KYBER_SYMBYTES  ] = x2;
-    extseed2[KYBER_SYMBYTES + 1] = y2;
+  extseed2[KYBER_SYMBYTES  ] = x2;
+  extseed2[KYBER_SYMBYTES+1] = y2;
 
-    shake128x2_absorb(state, extseed1, extseed2, KYBER_SYMBYTES + 2);
+  shake128x2_absorb(state, extseed1, extseed2, sizeof(extseed1)); // sizeof equals KYBER_SYMBYTES+2 only because the array is no longer padded
 }
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/ntt.c b/src/kem/kyber/pqclean_kyber1024_aarch64/ntt.c
index 1216c2c..7f28d9a 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/ntt.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/ntt.c
@@ -1,8 +1,35 @@
-#include "NTT_params.h"
-#include "ntt.h"
-#include "params.h"
-#include "reduce.h"
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <arm_neon.h>
+#include "params.h"
+#include "ntt.h"
+#include "reduce.h"
+#include "NTT_params.h"
 
 /*************************************************
 * Name:        ntt
@@ -27,7 +54,8 @@
 * Arguments:   - int16_t r[256] in {-(q-1)/2,...,(q-1)/2}
 *              pointer to input/output vector of elements of Zq
 **************************************************/
-void invntt(int16_t r[256]) {
-    iNTT(r);
+void invntt(int16_t r[256])
+{
+  iNTT(r); // all work is delegated to iNTT (defined outside this hunk); r is both input and output per the header comment
 
 }
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/ntt.h b/src/kem/kyber/pqclean_kyber1024_aarch64/ntt.h
index a3e46d0..b67126b 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/ntt.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/ntt.h
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_H
 #define NTT_H
 #include "params.h"
@@ -9,53 +36,55 @@
 
 #define ntt KYBER_NAMESPACE(ntt)
 void ntt(int16_t r[256]);
-
 #define invntt KYBER_NAMESPACE(invntt)
 void invntt(int16_t r[256]);
 
 
-extern void PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *);
-extern void PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top(int16_t*, const int16_t*, const int16_t*);
+extern void PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot(int16_t*, const int16_t*, const int16_t*);
 
-extern void PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot(int16_t *, const int16_t *, const int16_t *);
-extern void PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top(int16_t *, const int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot(int16_t*, const int16_t*, const int16_t*);
+extern void PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top(int16_t*, const int16_t*, const int16_t*);
 
-extern void PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended(int16_t *, const int16_t *, const int16_t *, const int16_t *);
-extern void PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *);
-extern void PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *);
+extern void PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended(int16_t*, const int16_t*, const int16_t*, const int16_t*);
+extern void PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul(const int16_t*, const int16_t*, const int16_t*, const int16_t*, int16_t*);
+extern void PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery(const int16_t*, const int16_t*, const int16_t*, const int16_t*, int16_t*);
 
 static const int16_t asymmetric_const[16] = {
-    Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime
+Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime
 };
 
 #define NTT(in) { \
-        PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
-        PQCLEAN_KYBER1024_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
     }
 
 #define iNTT(in) { \
-        PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
-        PQCLEAN_KYBER1024_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
+	PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
+	PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
     }
 
 static const int16_t constants[16] = {
-    Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1,
-    invNQ1_R3modQ1_prime_half,
-    invNQ1_R3modQ1_doubleprime,
-    invNQ1_final_R3modQ1_prime_half,
-    invNQ1_final_R3modQ1_doubleprime
+Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1,
+invNQ1_R3modQ1_prime_half,
+invNQ1_R3modQ1_doubleprime,
+invNQ1_final_R3modQ1_prime_half,
+invNQ1_final_R3modQ1_doubleprime
 };
 
-static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = {
-    0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, 
-7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0
+static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] =
+{
+0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, 
-7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0
 };
 
-static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = {
-    167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175
+static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] =
+{
+167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175
 };
 
-static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = {
-    0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 
10355, 1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0
+static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] =
+{
+0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 10355, 
1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0
 };
 
 #endif
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/params.h b/src/kem/kyber/pqclean_kyber1024_aarch64/params.h
index 90bd99f..d7bc76e 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/params.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/params.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef PARAMS_H
 #define PARAMS_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/poly.c b/src/kem/kyber/pqclean_kyber1024_aarch64/poly.c
index 7e55351..1dfa52c 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/poly.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/poly.c
@@ -1,10 +1,43 @@
-#include "cbd.h"
-#include "ntt.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/blob/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <arm_neon.h>
 #include "params.h"
 #include "poly.h"
+#include "ntt.h"
 #include "reduce.h"
+#include "cbd.h"
 #include "symmetric.h"
-#include <arm_neon.h>
 
 /*************************************************
 * Name:        poly_compress
@@ -152,6 +185,9 @@
     unsigned int i, j;
     int16_t mask;
 
+    #if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
+#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
+    #endif
 
     for (i = 0; i < KYBER_N / 8; i++) {
         for (j = 0; j < 8; j++) {
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/poly.h b/src/kem/kyber/pqclean_kyber1024_aarch64/poly.h
index 51657a6..4caf07d 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/poly.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/poly.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef POLY_H
 #define POLY_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/polyvec.c b/src/kem/kyber/pqclean_kyber1024_aarch64/polyvec.c
index 3011244..d400348 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/polyvec.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/polyvec.c
@@ -1,7 +1,14 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
+#include <stdint.h>
 #include "params.h"
 #include "poly.h"
 #include "polyvec.h"
-#include <stdint.h>
 
 /*************************************************
 * Name:        polyvec_compress
@@ -15,6 +22,7 @@
 void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]) {
     unsigned int i, j, k;
 
+    #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
     uint16_t t[8];
     for (i = 0; i < KYBER_K; i++) {
         for (j = 0; j < KYBER_N / 8; j++) {
@@ -38,6 +46,27 @@
             r += 11;
         }
     }
+    #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    uint16_t t[4];
+    for (i = 0; i < KYBER_K; i++) {
+        for (j = 0; j < KYBER_N / 4; j++) {
+            for (k = 0; k < 4; k++) {
+                t[k]  = a[i][4 * j + k];
+                t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
+                t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff;
+            }
+
+            r[0] = (t[0] >> 0);
+            r[1] = (t[0] >> 8) | (t[1] << 2);
+            r[2] = (t[1] >> 6) | (t[2] << 4);
+            r[3] = (t[2] >> 4) | (t[3] << 6);
+            r[4] = (t[3] >> 2);
+            r += 5;
+        }
+    }
+    #else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+    #endif
 }
 
 /*************************************************
@@ -53,6 +82,7 @@
 void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
     unsigned int i, j, k;
 
+    #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
     uint16_t t[8];
     for (i = 0; i < KYBER_K; i++) {
         for (j = 0; j < KYBER_N / 8; j++) {
@@ -71,6 +101,24 @@
             }
         }
     }
+    #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    uint16_t t[4];
+    for (i = 0; i < KYBER_K; i++) {
+        for (j = 0; j < KYBER_N / 4; j++) {
+            t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8);
+            t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6);
+            t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4);
+            t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2);
+            a += 5;
+
+            for (k = 0; k < 4; k++) {
+                r[i][4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10;
+            }
+        }
+    }
+    #else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+    #endif
 }
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/polyvec.h b/src/kem/kyber/pqclean_kyber1024_aarch64/polyvec.h
index 560f267..04a2c5c 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/polyvec.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/polyvec.h
@@ -1,3 +1,37 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef POLYVEC_H
 #define POLYVEC_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/reduce.c b/src/kem/kyber/pqclean_kyber1024_aarch64/reduce.c
index ec3328c..7143512 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/reduce.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/reduce.c
@@ -1,6 +1,13 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
+#include <stdint.h>
 #include "params.h"
 #include "reduce.h"
-#include <stdint.h>
 
 /*************************************************
 * Name:        montgomery_reduce
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/reduce.h b/src/kem/kyber/pqclean_kyber1024_aarch64/reduce.h
index 9a35638..c443afb 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/reduce.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/reduce.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef REDUCE_H
 #define REDUCE_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/rejsample.c b/src/kem/kyber/pqclean_kyber1024_aarch64/rejsample.c
index 05a1990..d694ab8 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/rejsample.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/rejsample.c
@@ -1,7 +1,15 @@
-#include "params.h"
-#include "rejsample.h"
-#include "symmetric.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #include <arm_neon.h>
+#include "params.h"
+#include "symmetric.h"
+#include "rejsample.h"
 
 // Define NEON operation
 // Load 8x16
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/rejsample.h b/src/kem/kyber/pqclean_kyber1024_aarch64/rejsample.h
index 8a94a4d..540c3a0 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/rejsample.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/rejsample.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef REJSAMPLE_H
 #define REJSAMPLE_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/symmetric-shake.c b/src/kem/kyber/pqclean_kyber1024_aarch64/symmetric-shake.c
index 9311d5d..e7e7e87 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/symmetric-shake.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/symmetric-shake.c
@@ -1,9 +1,16 @@
-#include "fips202.h"
-#include "params.h"
-#include "symmetric.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
+#include "params.h"
+#include "symmetric.h"
+#include "fips202.h"
 
 /*************************************************
 * Name:        kyber_shake128_absorb
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/symmetric.h b/src/kem/kyber/pqclean_kyber1024_aarch64/symmetric.h
index ac0a783..12f6a5c 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/symmetric.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/symmetric.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef SYMMETRIC_H
 #define SYMMETRIC_H
 
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/verify.c b/src/kem/kyber/pqclean_kyber1024_aarch64/verify.c
index 5d53c66..ca30408 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/verify.c
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/verify.c
@@ -1,6 +1,13 @@
-#include "verify.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #include <stddef.h>
 #include <stdint.h>
+#include "verify.h"
 
 /*************************************************
 * Name:        verify
diff --git a/src/kem/kyber/pqclean_kyber1024_aarch64/verify.h b/src/kem/kyber/pqclean_kyber1024_aarch64/verify.h
index 521f861..18ae986 100644
--- a/src/kem/kyber/pqclean_kyber1024_aarch64/verify.h
+++ b/src/kem/kyber/pqclean_kyber1024_aarch64/verify.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef VERIFY_H
 #define VERIFY_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/Makefile b/src/kem/kyber/pqclean_kyber512_aarch64/Makefile
deleted file mode 100644
index 86fa51e..0000000
--- a/src/kem/kyber/pqclean_kyber512_aarch64/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-# This Makefile can be used with GNU Make or BSD Make
-
-LIB=libkyber512_aarch64.a
-HEADERS=api.h cbd.h fips202x2.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h 
-OBJECTS=cbd.o fips202x2.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o 
-
-CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)
-
-all: $(LIB)
-
-%.o: %.c $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-%.o: %.S $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-$(LIB): $(OBJECTS)
-	$(AR) -r $@ $(OBJECTS)
-
-clean:
-	$(RM) $(OBJECTS)
-	$(RM) $(LIB)
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/NTT_params.h b/src/kem/kyber/pqclean_kyber512_aarch64/NTT_params.h
index 49edeb9..77dae1f 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/NTT_params.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/NTT_params.h
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_PARAMS_H
 #define NTT_PARAMS_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/__asm_NTT.S b/src/kem/kyber/pqclean_kyber512_aarch64/__asm_NTT.S
index ada533e..47b75ef 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/__asm_NTT.S
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/__asm_NTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top
-.global _PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top, %function
-#endif
-PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top:
-_PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top:
+.global PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top
+.global _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top
+PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top:
+_PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top:
 
     push_all
     Q         .req w20
@@ -171,13 +194,10 @@
 
 
 .align 2
-.global PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot
-.global _PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot, %function
-#endif
-PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot:
-_PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot:
+.global PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot
+.global _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot
+PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot:
+_PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot:
 
     push_all
     Q         .req w20
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/__asm_base_mul.S b/src/kem/kyber/pqclean_kyber512_aarch64/__asm_base_mul.S
index a3d39f1..1c34656 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/__asm_base_mul.S
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/__asm_base_mul.S
@@ -1,16 +1,39 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 #include "params.h"
 
 .align 2
-.global PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended
-.global _PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended
-#if !defined(__clang__) && !defined(old_gas_syntax)
-  .type PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended, %function
-#endif
-PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended:
-_PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended:
+.global PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended
+.global _PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended
+PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended:
+_PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended:
 
     push_all
     Q         .req w20
@@ -71,13 +94,10 @@
 
 
 .align 2
-.global PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul
-.global _PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul, %function
-#endif
-PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul:
-_PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul:
+.global PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul
+.global _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul
+PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul:
+_PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul:
 
     push_all
     Q         .req w28
@@ -226,13 +246,10 @@
 
 
 .align 2
-.global PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery
-.global _PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery, %function
-#endif
-PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery:
-_PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery:
+.global PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery
+.global _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery
+PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery:
+_PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery:
 
     push_all
     Q         .req w28
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/__asm_iNTT.S b/src/kem/kyber/pqclean_kyber512_aarch64/__asm_iNTT.S
index 4a135e7..7acb200 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/__asm_iNTT.S
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/__asm_iNTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot
-.global _PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot, %function
-#endif
-PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot:
-_PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot:
+.global PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot
+.global _PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot
+PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot:
+_PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot:
 
     push_all
     Q         .req w20
@@ -88,13 +111,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top
-.global _PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top, %function
-#endif
-PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top:
-_PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top:
+.global PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top
+.global _PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top
+PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top:
+_PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top:
 
     push_all
     Q         .req w20
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/__asm_poly.S b/src/kem/kyber/pqclean_kyber512_aarch64/__asm_poly.S
index fa77b95..9d6de40 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/__asm_poly.S
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/__asm_poly.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_KYBER512_AARCH64_asm_add_reduce
-.global _PQCLEAN_KYBER512_AARCH64_asm_add_reduce
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER512_AARCH64_asm_add_reduce, %function
-#endif
-PQCLEAN_KYBER512_AARCH64_asm_add_reduce:
-_PQCLEAN_KYBER512_AARCH64_asm_add_reduce:
+.global PQCLEAN_KYBER512_AARCH64__asm_add_reduce
+.global _PQCLEAN_KYBER512_AARCH64__asm_add_reduce
+PQCLEAN_KYBER512_AARCH64__asm_add_reduce:
+_PQCLEAN_KYBER512_AARCH64__asm_add_reduce:
 
     mov w4, #3329
     mov w5, #25519
@@ -66,13 +89,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_KYBER512_AARCH64_asm_sub_reduce
-.global _PQCLEAN_KYBER512_AARCH64_asm_sub_reduce
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER512_AARCH64_asm_sub_reduce, %function
-#endif
-PQCLEAN_KYBER512_AARCH64_asm_sub_reduce:
-_PQCLEAN_KYBER512_AARCH64_asm_sub_reduce:
+.global PQCLEAN_KYBER512_AARCH64__asm_sub_reduce
+.global _PQCLEAN_KYBER512_AARCH64__asm_sub_reduce
+PQCLEAN_KYBER512_AARCH64__asm_sub_reduce:
+_PQCLEAN_KYBER512_AARCH64__asm_sub_reduce:
 
     mov w4, #3329
     mov w5, #25519
@@ -130,13 +150,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce
-.global _PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce, %function
-#endif
-PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce:
-_PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce:
+.global PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce
+.global _PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce
+PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce:
+_PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce:
 
     mov w4, #3329
     mov w5, #25519
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/api.h b/src/kem/kyber/pqclean_kyber512_aarch64/api.h
index 60dcbbd..542cc2d 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/api.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/api.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
+
 #ifndef PQCLEAN_KYBER512_AARCH64_API_H
 #define PQCLEAN_KYBER512_AARCH64_API_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/cbd.c b/src/kem/kyber/pqclean_kyber512_aarch64/cbd.c
index 9327001..e599b50 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/cbd.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/cbd.c
@@ -1,7 +1,15 @@
-#include "cbd.h"
-#include "params.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #include <arm_neon.h>
 #include <stdint.h>
+#include "params.h"
+#include "cbd.h"
 
 #define vload2(c, ptr) c = vld2q_u8(ptr);
 
@@ -23,7 +31,8 @@
 #define vsublh8(c, a, b) c = (int16x8_t)vsubl_high_u8(a, b);
 
 static
-void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) {
+void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4])
+{
     uint8x16x2_t t, d;      // 4
     uint8x16x2_t a, b;      // 4
     int16x8x4_t res1, res2; // 4
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/cbd.h b/src/kem/kyber/pqclean_kyber512_aarch64/cbd.h
index e1d2fb5..8a1cee5 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/cbd.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/cbd.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef CBD_H
 #define CBD_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/feat.S b/src/kem/kyber/pqclean_kyber512_aarch64/feat.S
new file mode 100644
index 0000000..c214d6f
--- /dev/null
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/feat.S
@@ -0,0 +1,168 @@
+
+/*
+MIT License
+
+Copyright (c) 2020 Bas Westerbaan
+Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3)
+// round: one Keccak-f[1600] round over v0-v24 (.2d lanes); reads one 8-byte round constant from [x1] and clobbers v25-v31.
+.macro round
+    // Execute theta, but without xoring into the state yet.
+    // Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i].
+    eor3 v25.16b, v0.16b, v5.16b, v10.16b
+    eor3 v26.16b, v1.16b, v6.16b, v11.16b
+    eor3 v27.16b, v2.16b, v7.16b, v12.16b
+    eor3 v28.16b, v3.16b, v8.16b, v13.16b
+    eor3 v29.16b, v4.16b, v9.16b, v14.16b
+
+    eor3 v25.16b, v25.16b, v15.16b, v20.16b
+    eor3 v26.16b, v26.16b, v16.16b, v21.16b
+    eor3 v27.16b, v27.16b, v17.16b, v22.16b
+    eor3 v28.16b, v28.16b, v18.16b, v23.16b
+    eor3 v29.16b, v29.16b, v19.16b, v24.16b
+
+    rax1 v30.2d, v29.2d, v26.2d // d[0] = rotl(p[1], 1) ^ p[4]
+    rax1 v29.2d, v27.2d, v29.2d // d[3] = rotl(p[4], 1) ^ p[2]
+    rax1 v27.2d, v25.2d, v27.2d // d[1] = rotl(p[2], 1) ^ p[0]
+    rax1 v25.2d, v28.2d, v25.2d // d[4] = rotl(p[0], 1) ^ p[3]
+    rax1 v28.2d, v26.2d, v28.2d // d[2] = rotl(p[3], 1) ^ p[1]
+
+    // Xor parities from step theta into the state at the same time
+    // as executing rho and pi.
+    eor v0.16b, v0.16b, v30.16b
+    mov v31.16b, v1.16b
+    xar v1.2d,  v6.2d,  v27.2d, #20
+    xar v6.2d,  v9.2d,  v25.2d, #44
+    xar v9.2d,  v22.2d, v28.2d, #3
+    xar v22.2d, v14.2d, v25.2d, #25
+    xar v14.2d, v20.2d, v30.2d, #46
+    xar v20.2d, v2.2d,  v28.2d, #2
+    xar v2.2d,  v12.2d, v28.2d, #21
+    xar v12.2d, v13.2d, v29.2d, #39
+    xar v13.2d, v19.2d, v25.2d, #56
+    xar v19.2d, v23.2d, v29.2d, #8
+    xar v23.2d, v15.2d, v30.2d, #23
+    xar v15.2d, v4.2d,  v25.2d, #37
+    xar v4.2d,  v24.2d, v25.2d, #50
+    xar v24.2d, v21.2d, v27.2d, #62
+    xar v21.2d, v8.2d,  v29.2d, #9
+    xar v8.2d,  v16.2d, v27.2d, #19
+    xar v16.2d, v5.2d,  v30.2d, #28
+    xar v5.2d,  v3.2d,  v29.2d, #36
+    xar v3.2d,  v18.2d, v29.2d, #43
+    xar v18.2d, v17.2d, v28.2d, #49
+    xar v17.2d, v11.2d, v27.2d, #54
+    xar v11.2d, v7.2d,  v28.2d, #58
+    xar v7.2d,  v10.2d, v30.2d, #61
+    xar v10.2d, v31.2d, v27.2d, #63
+
+    // Chi (v25/v26 hold the first two results of each row until the row is done)
+    bcax v25.16b, v0.16b,  v2.16b,  v1.16b
+    bcax v26.16b, v1.16b,  v3.16b,  v2.16b
+    bcax v2.16b,  v2.16b,  v4.16b,  v3.16b
+    bcax v3.16b,  v3.16b,  v0.16b,  v4.16b
+    bcax v4.16b,  v4.16b,  v1.16b,  v0.16b
+    mov v0.16b, v25.16b
+    mov v1.16b, v26.16b
+
+    bcax v25.16b, v5.16b,  v7.16b,  v6.16b
+    bcax v26.16b, v6.16b,  v8.16b,  v7.16b
+    bcax v7.16b,  v7.16b,  v9.16b,  v8.16b
+    bcax v8.16b,  v8.16b,  v5.16b,  v9.16b
+    bcax v9.16b,  v9.16b,  v6.16b,  v5.16b
+    mov v5.16b, v25.16b
+    mov v6.16b, v26.16b
+
+    bcax v25.16b, v10.16b,  v12.16b,  v11.16b
+    bcax v26.16b, v11.16b,  v13.16b,  v12.16b
+    bcax v12.16b, v12.16b,  v14.16b,  v13.16b
+    bcax v13.16b, v13.16b,  v10.16b,  v14.16b
+    bcax v14.16b, v14.16b,  v11.16b,  v10.16b
+    mov v10.16b, v25.16b
+    mov v11.16b, v26.16b
+
+    bcax v25.16b, v15.16b,  v17.16b,  v16.16b
+    bcax v26.16b, v16.16b,  v18.16b,  v17.16b
+    bcax v17.16b, v17.16b,  v19.16b,  v18.16b
+    bcax v18.16b, v18.16b,  v15.16b,  v19.16b
+    bcax v19.16b, v19.16b,  v16.16b,  v15.16b
+    mov v15.16b, v25.16b
+    mov v16.16b, v26.16b
+
+    bcax v25.16b, v20.16b,  v22.16b,  v21.16b
+    bcax v26.16b, v21.16b,  v23.16b,  v22.16b
+    bcax v22.16b, v22.16b,  v24.16b,  v23.16b
+    bcax v23.16b, v23.16b,  v20.16b,  v24.16b
+    bcax v24.16b, v24.16b,  v21.16b,  v20.16b
+    mov v20.16b, v25.16b
+    mov v21.16b, v26.16b
+
+    // iota: broadcast the next round constant to both lanes and advance x1
+    ld1r {v25.2d}, [x1], #8
+    eor v0.16b, v0.16b, v25.16b
+.endm
+// f1600x2(x0, x1): x0 = 25 x 16-byte state vectors (updated in place), x1 = 24 x 8-byte round constants.
+.align 4
+.global PQCLEAN_KYBER512_AARCH64_f1600x2
+.global _PQCLEAN_KYBER512_AARCH64_f1600x2
+PQCLEAN_KYBER512_AARCH64_f1600x2:
+_PQCLEAN_KYBER512_AARCH64_f1600x2:
+    stp d8,  d9,  [sp,#-16]!    // v8-v15 are overwritten below; save d8-d15
+    stp d10, d11, [sp,#-16]!
+    stp d12, d13, [sp,#-16]!
+    stp d14, d15, [sp,#-16]!
+
+    mov x2, x0                  // keep the state base, x0 is post-incremented
+    mov x3, #24                 // round counter
+
+    ld1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    ld1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    ld1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    ld1 {v24.2d}, [x0]
+
+1:
+    round
+
+    subs x3, x3, #1
+    b.ne 1b
+
+    mov x0, x2
+    st1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    st1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    st1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    st1 {v24.2d}, [x0]
+
+    ldp d14, d15, [sp], #16
+    ldp d12, d13, [sp], #16
+    ldp d10, d11, [sp], #16
+    ldp d8,  d9,  [sp], #16
+
+    ret
+
+#endif
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/fips202x2.c b/src/kem/kyber/pqclean_kyber512_aarch64/fips202x2.c
index 3924900..3955a96 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/fips202x2.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/fips202x2.c
@@ -1,6 +1,40 @@
-#include "fips202x2.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <arm_neon.h>
 #include <stddef.h>
+#include "fips202x2.h"
 
 
 #define NROUNDS 24
@@ -67,275 +101,282 @@
 *
 * Arguments:   - uint64_t *state: pointer to input/output Keccak state
 **************************************************/
+extern void PQCLEAN_KYBER512_AARCH64_f1600x2(v128*, const uint64_t*);
 static inline
-void KeccakF1600_StatePermutex2(v128 state[25]) {
-    v128 Aba, Abe, Abi, Abo, Abu;
-    v128 Aga, Age, Agi, Ago, Agu;
-    v128 Aka, Ake, Aki, Ako, Aku;
-    v128 Ama, Ame, Ami, Amo, Amu;
-    v128 Asa, Ase, Asi, Aso, Asu;
-    v128 BCa, BCe, BCi, BCo, BCu; // tmp
-    v128 Da, De, Di, Do, Du;      // D
-    v128 Eba, Ebe, Ebi, Ebo, Ebu;
-    v128 Ega, Ege, Egi, Ego, Egu;
-    v128 Eka, Eke, Eki, Eko, Eku;
-    v128 Ema, Eme, Emi, Emo, Emu;
-    v128 Esa, Ese, Esi, Eso, Esu;
+void KeccakF1600_StatePermutex2(v128 state[25])
+{
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* this condition must match the one in feat.S, which defines the assembly permutation called here */
+  PQCLEAN_KYBER512_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
+#else
+  v128 Aba, Abe, Abi, Abo, Abu;
+  v128 Aga, Age, Agi, Ago, Agu;
+  v128 Aka, Ake, Aki, Ako, Aku;
+  v128 Ama, Ame, Ami, Amo, Amu;
+  v128 Asa, Ase, Asi, Aso, Asu;
+  v128 BCa, BCe, BCi, BCo, BCu; // tmp
+  v128 Da, De, Di, Do, Du;      // D
+  v128 Eba, Ebe, Ebi, Ebo, Ebu;
+  v128 Ega, Ege, Egi, Ego, Egu;
+  v128 Eka, Eke, Eki, Eko, Eku;
+  v128 Ema, Eme, Emi, Emo, Emu;
+  v128 Esa, Ese, Esi, Eso, Esu;
 
-    //copyFromState(A, state)
-    Aba = state[0];
-    Abe = state[1];
-    Abi = state[2];
-    Abo = state[3];
-    Abu = state[4];
-    Aga = state[5];
-    Age = state[6];
-    Agi = state[7];
-    Ago = state[8];
-    Agu = state[9];
-    Aka = state[10];
-    Ake = state[11];
-    Aki = state[12];
-    Ako = state[13];
-    Aku = state[14];
-    Ama = state[15];
-    Ame = state[16];
-    Ami = state[17];
-    Amo = state[18];
-    Amu = state[19];
-    Asa = state[20];
-    Ase = state[21];
-    Asi = state[22];
-    Aso = state[23];
-    Asu = state[24];
+  //copyFromState(A, state)
+  Aba = state[0];
+  Abe = state[1];
+  Abi = state[2];
+  Abo = state[3];
+  Abu = state[4];
+  Aga = state[5];
+  Age = state[6];
+  Agi = state[7];
+  Ago = state[8];
+  Agu = state[9];
+  Aka = state[10];
+  Ake = state[11];
+  Aki = state[12];
+  Ako = state[13];
+  Aku = state[14];
+  Ama = state[15];
+  Ame = state[16];
+  Ami = state[17];
+  Amo = state[18];
+  Amu = state[19];
+  Asa = state[20];
+  Ase = state[21];
+  Asi = state[22];
+  Aso = state[23];
+  Asu = state[24];
 
-    for (int round = 0; round < NROUNDS; round += 2) {
-        //    prepareTheta
-        vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
-        vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
-        vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
-        vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
-        vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
+  for (int round = 0; round < NROUNDS; round += 2)
+  {
+    //    prepareTheta
+    vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
+    vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
+    vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
+    vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
+    vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Aba, Aba, Da);
-        vxor(Age, Age, De);
-        vROL(BCe, Age, 44);
-        vxor(Aki, Aki, Di);
-        vROL(BCi, Aki, 43);
-        vxor(Amo, Amo, Do);
-        vROL(BCo, Amo, 21);
-        vxor(Asu, Asu, Du);
-        vROL(BCu, Asu, 14);
-        vXNA(Eba, Aba, BCe, BCi);
-        vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
-        vXNA(Ebe, BCe, BCi, BCo);
-        vXNA(Ebi, BCi, BCo, BCu);
-        vXNA(Ebo, BCo, BCu, Aba);
-        vXNA(Ebu, BCu, Aba, BCe);
+    vxor(Aba, Aba, Da);
+    vxor(Age, Age, De);
+    vROL(BCe, Age, 44);
+    vxor(Aki, Aki, Di);
+    vROL(BCi, Aki, 43);
+    vxor(Amo, Amo, Do);
+    vROL(BCo, Amo, 21);
+    vxor(Asu, Asu, Du);
+    vROL(BCu, Asu, 14);
+    vXNA(Eba, Aba, BCe, BCi);
+    vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
+    vXNA(Ebe, BCe, BCi, BCo);
+    vXNA(Ebi, BCi, BCo, BCu);
+    vXNA(Ebo, BCo, BCu, Aba);
+    vXNA(Ebu, BCu, Aba, BCe);
 
-        vxor(Abo, Abo, Do);
-        vROL(BCa, Abo, 28);
-        vxor(Agu, Agu, Du);
-        vROL(BCe, Agu, 20);
-        vxor(Aka, Aka, Da);
-        vROL(BCi, Aka, 3);
-        vxor(Ame, Ame, De);
-        vROL(BCo, Ame, 45);
-        vxor(Asi, Asi, Di);
-        vROL(BCu, Asi, 61);
-        vXNA(Ega, BCa, BCe, BCi);
-        vXNA(Ege, BCe, BCi, BCo);
-        vXNA(Egi, BCi, BCo, BCu);
-        vXNA(Ego, BCo, BCu, BCa);
-        vXNA(Egu, BCu, BCa, BCe);
+    vxor(Abo, Abo, Do);
+    vROL(BCa, Abo, 28);
+    vxor(Agu, Agu, Du);
+    vROL(BCe, Agu, 20);
+    vxor(Aka, Aka, Da);
+    vROL(BCi, Aka, 3);
+    vxor(Ame, Ame, De);
+    vROL(BCo, Ame, 45);
+    vxor(Asi, Asi, Di);
+    vROL(BCu, Asi, 61);
+    vXNA(Ega, BCa, BCe, BCi);
+    vXNA(Ege, BCe, BCi, BCo);
+    vXNA(Egi, BCi, BCo, BCu);
+    vXNA(Ego, BCo, BCu, BCa);
+    vXNA(Egu, BCu, BCa, BCe);
 
-        vxor(Abe, Abe, De);
-        vROL(BCa, Abe, 1);
-        vxor(Agi, Agi, Di);
-        vROL(BCe, Agi, 6);
-        vxor(Ako, Ako, Do);
-        vROL(BCi, Ako, 25);
-        vxor(Amu, Amu, Du);
-        vROL(BCo, Amu, 8);
-        vxor(Asa, Asa, Da);
-        vROL(BCu, Asa, 18);
-        vXNA(Eka, BCa, BCe, BCi);
-        vXNA(Eke, BCe, BCi, BCo);
-        vXNA(Eki, BCi, BCo, BCu);
-        vXNA(Eko, BCo, BCu, BCa);
-        vXNA(Eku, BCu, BCa, BCe);
+    vxor(Abe, Abe, De);
+    vROL(BCa, Abe, 1);
+    vxor(Agi, Agi, Di);
+    vROL(BCe, Agi, 6);
+    vxor(Ako, Ako, Do);
+    vROL(BCi, Ako, 25);
+    vxor(Amu, Amu, Du);
+    vROL(BCo, Amu, 8);
+    vxor(Asa, Asa, Da);
+    vROL(BCu, Asa, 18);
+    vXNA(Eka, BCa, BCe, BCi);
+    vXNA(Eke, BCe, BCi, BCo);
+    vXNA(Eki, BCi, BCo, BCu);
+    vXNA(Eko, BCo, BCu, BCa);
+    vXNA(Eku, BCu, BCa, BCe);
 
-        vxor(Abu, Abu, Du);
-        vROL(BCa, Abu, 27);
-        vxor(Aga, Aga, Da);
-        vROL(BCe, Aga, 36);
-        vxor(Ake, Ake, De);
-        vROL(BCi, Ake, 10);
-        vxor(Ami, Ami, Di);
-        vROL(BCo, Ami, 15);
-        vxor(Aso, Aso, Do);
-        vROL(BCu, Aso, 56);
-        vXNA(Ema, BCa, BCe, BCi);
-        vXNA(Eme, BCe, BCi, BCo);
-        vXNA(Emi, BCi, BCo, BCu);
-        vXNA(Emo, BCo, BCu, BCa);
-        vXNA(Emu, BCu, BCa, BCe);
+    vxor(Abu, Abu, Du);
+    vROL(BCa, Abu, 27);
+    vxor(Aga, Aga, Da);
+    vROL(BCe, Aga, 36);
+    vxor(Ake, Ake, De);
+    vROL(BCi, Ake, 10);
+    vxor(Ami, Ami, Di);
+    vROL(BCo, Ami, 15);
+    vxor(Aso, Aso, Do);
+    vROL(BCu, Aso, 56);
+    vXNA(Ema, BCa, BCe, BCi);
+    vXNA(Eme, BCe, BCi, BCo);
+    vXNA(Emi, BCi, BCo, BCu);
+    vXNA(Emo, BCo, BCu, BCa);
+    vXNA(Emu, BCu, BCa, BCe);
 
-        vxor(Abi, Abi, Di);
-        vROL(BCa, Abi, 62);
-        vxor(Ago, Ago, Do);
-        vROL(BCe, Ago, 55);
-        vxor(Aku, Aku, Du);
-        vROL(BCi, Aku, 39);
-        vxor(Ama, Ama, Da);
-        vROL(BCo, Ama, 41);
-        vxor(Ase, Ase, De);
-        vROL(BCu, Ase, 2);
-        vXNA(Esa, BCa, BCe, BCi);
-        vXNA(Ese, BCe, BCi, BCo);
-        vXNA(Esi, BCi, BCo, BCu);
-        vXNA(Eso, BCo, BCu, BCa);
-        vXNA(Esu, BCu, BCa, BCe);
+    vxor(Abi, Abi, Di);
+    vROL(BCa, Abi, 62);
+    vxor(Ago, Ago, Do);
+    vROL(BCe, Ago, 55);
+    vxor(Aku, Aku, Du);
+    vROL(BCi, Aku, 39);
+    vxor(Ama, Ama, Da);
+    vROL(BCo, Ama, 41);
+    vxor(Ase, Ase, De);
+    vROL(BCu, Ase, 2);
+    vXNA(Esa, BCa, BCe, BCi);
+    vXNA(Ese, BCe, BCi, BCo);
+    vXNA(Esi, BCi, BCo, BCu);
+    vXNA(Eso, BCo, BCu, BCa);
+    vXNA(Esu, BCu, BCa, BCe);
 
-        // Next Round
+    // Next Round
 
-        //    prepareTheta
-        vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
-        vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
-        vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
-        vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
-        vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
+    //    prepareTheta
+    vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
+    vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
+    vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
+    vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
+    vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Eba, Eba, Da);
-        vxor(Ege, Ege, De);
-        vROL(BCe, Ege, 44);
-        vxor(Eki, Eki, Di);
-        vROL(BCi, Eki, 43);
-        vxor(Emo, Emo, Do);
-        vROL(BCo, Emo, 21);
-        vxor(Esu, Esu, Du);
-        vROL(BCu, Esu, 14);
-        vXNA(Aba, Eba, BCe, BCi);
-        vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
-        vXNA(Abe, BCe, BCi, BCo);
-        vXNA(Abi, BCi, BCo, BCu);
-        vXNA(Abo, BCo, BCu, Eba);
-        vXNA(Abu, BCu, Eba, BCe);
+    vxor(Eba, Eba, Da);
+    vxor(Ege, Ege, De);
+    vROL(BCe, Ege, 44);
+    vxor(Eki, Eki, Di);
+    vROL(BCi, Eki, 43);
+    vxor(Emo, Emo, Do);
+    vROL(BCo, Emo, 21);
+    vxor(Esu, Esu, Du);
+    vROL(BCu, Esu, 14);
+    vXNA(Aba, Eba, BCe, BCi);
+    vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
+    vXNA(Abe, BCe, BCi, BCo);
+    vXNA(Abi, BCi, BCo, BCu);
+    vXNA(Abo, BCo, BCu, Eba);
+    vXNA(Abu, BCu, Eba, BCe);
 
-        vxor(Ebo, Ebo, Do);
-        vROL(BCa, Ebo, 28);
-        vxor(Egu, Egu, Du);
-        vROL(BCe, Egu, 20);
-        vxor(Eka, Eka, Da);
-        vROL(BCi, Eka, 3);
-        vxor(Eme, Eme, De);
-        vROL(BCo, Eme, 45);
-        vxor(Esi, Esi, Di);
-        vROL(BCu, Esi, 61);
-        vXNA(Aga, BCa, BCe, BCi);
-        vXNA(Age, BCe, BCi, BCo);
-        vXNA(Agi, BCi, BCo, BCu);
-        vXNA(Ago, BCo, BCu, BCa);
-        vXNA(Agu, BCu, BCa, BCe);
+    vxor(Ebo, Ebo, Do);
+    vROL(BCa, Ebo, 28);
+    vxor(Egu, Egu, Du);
+    vROL(BCe, Egu, 20);
+    vxor(Eka, Eka, Da);
+    vROL(BCi, Eka, 3);
+    vxor(Eme, Eme, De);
+    vROL(BCo, Eme, 45);
+    vxor(Esi, Esi, Di);
+    vROL(BCu, Esi, 61);
+    vXNA(Aga, BCa, BCe, BCi);
+    vXNA(Age, BCe, BCi, BCo);
+    vXNA(Agi, BCi, BCo, BCu);
+    vXNA(Ago, BCo, BCu, BCa);
+    vXNA(Agu, BCu, BCa, BCe);
 
-        vxor(Ebe, Ebe, De);
-        vROL(BCa, Ebe, 1);
-        vxor(Egi, Egi, Di);
-        vROL(BCe, Egi, 6);
-        vxor(Eko, Eko, Do);
-        vROL(BCi, Eko, 25);
-        vxor(Emu, Emu, Du);
-        vROL(BCo, Emu, 8);
-        vxor(Esa, Esa, Da);
-        vROL(BCu, Esa, 18);
-        vXNA(Aka, BCa, BCe, BCi);
-        vXNA(Ake, BCe, BCi, BCo);
-        vXNA(Aki, BCi, BCo, BCu);
-        vXNA(Ako, BCo, BCu, BCa);
-        vXNA(Aku, BCu, BCa, BCe);
+    vxor(Ebe, Ebe, De);
+    vROL(BCa, Ebe, 1);
+    vxor(Egi, Egi, Di);
+    vROL(BCe, Egi, 6);
+    vxor(Eko, Eko, Do);
+    vROL(BCi, Eko, 25);
+    vxor(Emu, Emu, Du);
+    vROL(BCo, Emu, 8);
+    vxor(Esa, Esa, Da);
+    vROL(BCu, Esa, 18);
+    vXNA(Aka, BCa, BCe, BCi);
+    vXNA(Ake, BCe, BCi, BCo);
+    vXNA(Aki, BCi, BCo, BCu);
+    vXNA(Ako, BCo, BCu, BCa);
+    vXNA(Aku, BCu, BCa, BCe);
 
-        vxor(Ebu, Ebu, Du);
-        vROL(BCa, Ebu, 27);
-        vxor(Ega, Ega, Da);
-        vROL(BCe, Ega, 36);
-        vxor(Eke, Eke, De);
-        vROL(BCi, Eke, 10);
-        vxor(Emi, Emi, Di);
-        vROL(BCo, Emi, 15);
-        vxor(Eso, Eso, Do);
-        vROL(BCu, Eso, 56);
-        vXNA(Ama, BCa, BCe, BCi);
-        vXNA(Ame, BCe, BCi, BCo);
-        vXNA(Ami, BCi, BCo, BCu);
-        vXNA(Amo, BCo, BCu, BCa);
-        vXNA(Amu, BCu, BCa, BCe);
+    vxor(Ebu, Ebu, Du);
+    vROL(BCa, Ebu, 27);
+    vxor(Ega, Ega, Da);
+    vROL(BCe, Ega, 36);
+    vxor(Eke, Eke, De);
+    vROL(BCi, Eke, 10);
+    vxor(Emi, Emi, Di);
+    vROL(BCo, Emi, 15);
+    vxor(Eso, Eso, Do);
+    vROL(BCu, Eso, 56);
+    vXNA(Ama, BCa, BCe, BCi);
+    vXNA(Ame, BCe, BCi, BCo);
+    vXNA(Ami, BCi, BCo, BCu);
+    vXNA(Amo, BCo, BCu, BCa);
+    vXNA(Amu, BCu, BCa, BCe);
 
-        vxor(Ebi, Ebi, Di);
-        vROL(BCa, Ebi, 62);
-        vxor(Ego, Ego, Do);
-        vROL(BCe, Ego, 55);
-        vxor(Eku, Eku, Du);
-        vROL(BCi, Eku, 39);
-        vxor(Ema, Ema, Da);
-        vROL(BCo, Ema, 41);
-        vxor(Ese, Ese, De);
-        vROL(BCu, Ese, 2);
-        vXNA(Asa, BCa, BCe, BCi);
-        vXNA(Ase, BCe, BCi, BCo);
-        vXNA(Asi, BCi, BCo, BCu);
-        vXNA(Aso, BCo, BCu, BCa);
-        vXNA(Asu, BCu, BCa, BCe);
-    }
+    vxor(Ebi, Ebi, Di);
+    vROL(BCa, Ebi, 62);
+    vxor(Ego, Ego, Do);
+    vROL(BCe, Ego, 55);
+    vxor(Eku, Eku, Du);
+    vROL(BCi, Eku, 39);
+    vxor(Ema, Ema, Da);
+    vROL(BCo, Ema, 41);
+    vxor(Ese, Ese, De);
+    vROL(BCu, Ese, 2);
+    vXNA(Asa, BCa, BCe, BCi);
+    vXNA(Ase, BCe, BCi, BCo);
+    vXNA(Asi, BCi, BCo, BCu);
+    vXNA(Aso, BCo, BCu, BCa);
+    vXNA(Asu, BCu, BCa, BCe);
+  }
 
-    state[0] = Aba;
-    state[1] = Abe;
-    state[2] = Abi;
-    state[3] = Abo;
-    state[4] = Abu;
-    state[5] = Aga;
-    state[6] = Age;
-    state[7] = Agi;
-    state[8] = Ago;
-    state[9] = Agu;
-    state[10] = Aka;
-    state[11] = Ake;
-    state[12] = Aki;
-    state[13] = Ako;
-    state[14] = Aku;
-    state[15] = Ama;
-    state[16] = Ame;
-    state[17] = Ami;
-    state[18] = Amo;
-    state[19] = Amu;
-    state[20] = Asa;
-    state[21] = Ase;
-    state[22] = Asi;
-    state[23] = Aso;
-    state[24] = Asu;
+  state[0] = Aba;
+  state[1] = Abe;
+  state[2] = Abi;
+  state[3] = Abo;
+  state[4] = Abu;
+  state[5] = Aga;
+  state[6] = Age;
+  state[7] = Agi;
+  state[8] = Ago;
+  state[9] = Agu;
+  state[10] = Aka;
+  state[11] = Ake;
+  state[12] = Aki;
+  state[13] = Ako;
+  state[14] = Aku;
+  state[15] = Ama;
+  state[16] = Ame;
+  state[17] = Ami;
+  state[18] = Amo;
+  state[19] = Amu;
+  state[20] = Asa;
+  state[21] = Ase;
+  state[22] = Asi;
+  state[23] = Aso;
+  state[24] = Asu;
+#endif
 }
 
 /*************************************************
@@ -463,39 +504,41 @@
                             uint8_t *out1,
                             size_t nblocks,
                             unsigned int r,
-                            v128 s[25]) {
-    unsigned int i;
+                            v128 s[25]){
+  unsigned int i;
 
-    uint64x1_t a, b;
-    uint64x2x2_t a2, b2;
+  uint64x1_t a, b;
+  uint64x2x2_t a2, b2;
 
-    while (nblocks > 0) {
-        KeccakF1600_StatePermutex2(s);
+  while (nblocks > 0)
+  {
+    KeccakF1600_StatePermutex2(s);
 
-        for (i = 0; i < r / 8 - 1; i += 4) {
-            a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
-            b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
-            a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
-            b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
-            vst1q_u64_x2((uint64_t *)out0, a2);
-            vst1q_u64_x2((uint64_t *)out1, b2);
+    for (i = 0; i < r / 8 - 1; i += 4)
+    {
+      a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
+      b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
+      a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
+      b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
+      vst1q_u64_x2((uint64_t *)out0, a2);
+      vst1q_u64_x2((uint64_t *)out1, b2);
 
-            out0 += 32;
-            out1 += 32;
-        }
-
-        i = r / 8 - 1;
-        // Last iteration
-        a = vget_low_u64(s[i]);
-        b = vget_high_u64(s[i]);
-        vst1_u64((uint64_t *)out0, a);
-        vst1_u64((uint64_t *)out1, b);
-
-        out0 += 8;
-        out1 += 8;
-
-        --nblocks;
+      out0 += 32;
+      out1 += 32;
     }
+
+    i = r / 8 - 1;
+    // Last iteration
+    a = vget_low_u64(s[i]);
+    b = vget_high_u64(s[i]);
+    vst1_u64((uint64_t *)out0, a);
+    vst1_u64((uint64_t *)out1, b);
+
+    out0 += 8;
+    out1 += 8;
+
+    --nblocks;
+  }
 }
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/fips202x2.h b/src/kem/kyber/pqclean_kyber512_aarch64/fips202x2.h
index 426988d..a1eacdf 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/fips202x2.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/fips202x2.h
@@ -1,15 +1,21 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef FIPS202X2_H
 #define FIPS202X2_H
 
 #include "params.h"
 #include <arm_neon.h>
 #include <stddef.h>
-
 #include "fips202.h"
 
 typedef uint64x2_t v128;
 
-
 typedef struct {
     v128 s[25];
 } keccakx2_state;
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/indcpa.c b/src/kem/kyber/pqclean_kyber512_aarch64/indcpa.c
index 26a655a..26d9e7c 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/indcpa.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/indcpa.c
@@ -1,15 +1,50 @@
-#include "NTT_params.h"
-#include "indcpa.h"
-#include "ntt.h"
-#include "params.h"
-#include "poly.h"
-#include "polyvec.h"
-#include "randombytes.h"
-#include "rejsample.h"
-#include "symmetric.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+#include "params.h"
+#include "rejsample.h"
+#include "indcpa.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "symmetric.h"
+
+#include "NTT_params.h"
+#include "ntt.h"
 
 /*************************************************
 * Name:        pack_pk
@@ -195,11 +230,11 @@
     neon_polyvec_ntt(e);
 
     for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended(&(skpv_asymmetric[i][0]), &(skpv[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
+        PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended(&(skpv_asymmetric[i][0]), &(skpv[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
     }
 
     for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery(&(a[i][0][0]), &(skpv[0][0]), &(skpv_asymmetric[0][0]), asymmetric_const, pkpv[i]);
+        PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery(&(a[i][0][0]), &(skpv[0][0]), &(skpv_asymmetric[0][0]), asymmetric_const, pkpv[i]);
     }
 
     neon_polyvec_add_reduce(pkpv, e);
@@ -245,22 +280,22 @@
     poly_frommsg(k, m);
     gen_at(at, seed);
 
-    // ETA1 != ETA2 (3 != 2)
-    neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1);
-    neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3);
-    neon_poly_getnoise_eta2(&(epp[0]), coins, 4);
+  // ETA1 != ETA2 (3 != 2)
+  neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1);
+  neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3);
+  neon_poly_getnoise_eta2(&(epp[0]), coins, 4);
 
     neon_polyvec_ntt(sp);
 
-    for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended(&(sp_asymmetric[i][0]), &(sp[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
+    for(i = 0; i < KYBER_K; i++){
+        PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended(&(sp_asymmetric[i][0]), &(sp[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
     }
 
-    for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul(&(at[i][0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, b[i]);
+    for(i = 0; i < KYBER_K; i++){
+        PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul(&(at[i][0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, b[i]);
     }
 
-    PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul(&(pkpv[0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, v);
+    PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul(&(pkpv[0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, v);
 
     neon_polyvec_invntt_to_mont(b);
     invntt(v);
@@ -302,10 +337,10 @@
     neon_polyvec_ntt(b);
 
     for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended(&(b_asymmetric[i][0]), &(b[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
+        PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended(&(b_asymmetric[i][0]), &(b[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
     }
 
-    PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul(&(skpv[0][0]), &(b[0][0]), &(b_asymmetric[0][0]), asymmetric_const, mp);
+    PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul(&(skpv[0][0]), &(b[0][0]), &(b_asymmetric[0][0]), asymmetric_const, mp);
 
     invntt(mp);
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/indcpa.h b/src/kem/kyber/pqclean_kyber512_aarch64/indcpa.h
index f718f39..b74bc0b 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/indcpa.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/indcpa.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef INDCPA_H
 #define INDCPA_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/kem.c b/src/kem/kyber/pqclean_kyber512_aarch64/kem.c
index bcd93a8..1b3d063 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/kem.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/kem.c
@@ -1,11 +1,18 @@
-#include "indcpa.h"
-#include "kem.h"
-#include "params.h"
-#include "randombytes.h"
-#include "symmetric.h"
-#include "verify.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #include <stddef.h>
 #include <stdint.h>
+#include "params.h"
+#include "indcpa.h"
+#include "verify.h"
+#include "symmetric.h"
+#include "randombytes.h"
+#include "kem.h"
 
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/kem.h b/src/kem/kyber/pqclean_kyber512_aarch64/kem.h
index 9b72b3e..205e96a 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/kem.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/kem.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef KEM_H
 #define KEM_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/macros.inc b/src/kem/kyber/pqclean_kyber512_aarch64/macros.inc
index 9e392b0..2add309 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/macros.inc
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/macros.inc
@@ -1,4 +1,30 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef MACROS_S
 #define MACROS_S
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/macros_common.inc b/src/kem/kyber/pqclean_kyber512_aarch64/macros_common.inc
index 26e7cbb..c1ac021 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/macros_common.inc
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/macros_common.inc
@@ -1,4 +1,30 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef MACROS_COMMON
 #define MACROS_COMMON
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/neon_poly.c b/src/kem/kyber/pqclean_kyber512_aarch64/neon_poly.c
index 0c93632..fdb37f9 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/neon_poly.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/neon_poly.c
@@ -1,9 +1,43 @@
-#include "cbd.h"
-#include "ntt.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <arm_neon.h>
 #include "params.h"
 #include "poly.h"
+#include "ntt.h"
+#include "cbd.h"
 #include "symmetric.h"
-#include <arm_neon.h>
 
 
 /*************************************************
@@ -97,14 +131,14 @@
 *            - const poly *a: pointer to first input polynomial
 *            - const poly *b: pointer to second input polynomial
 **************************************************/
-extern void PQCLEAN_KYBER512_AARCH64_asm_add_reduce(int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER512_AARCH64__asm_add_reduce(int16_t *, const int16_t *); // renamed: now two underscores before "asm"; only declared here, no definition visible in this hunk
 void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
-    PQCLEAN_KYBER512_AARCH64_asm_add_reduce(c, a);
+    PQCLEAN_KYBER512_AARCH64__asm_add_reduce(c, a); // forwards both pointers unchanged; presumably c = reduce(c + a) judging by the name -- TODO confirm against the routine's definition
 }
 
-extern void PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); // renamed: now two underscores before "asm"; only declared here, no definition visible in this hunk
 void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) {
-    PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce(c, a, b);
+    PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce(c, a, b); // forwards all three pointers unchanged; presumably c = reduce(c + a + b) judging by the name -- TODO confirm against the routine's definition
 }
 
 /*************************************************
@@ -118,7 +152,7 @@
 *            - const poly *a: pointer to first input polynomial
 *            - const poly *b: pointer to second input polynomial
 **************************************************/
-extern void PQCLEAN_KYBER512_AARCH64_asm_sub_reduce(int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER512_AARCH64__asm_sub_reduce(int16_t *, const int16_t *); // renamed: now two underscores before "asm"; only declared here, no definition visible in this hunk
 void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
-    PQCLEAN_KYBER512_AARCH64_asm_sub_reduce(c, a);
+    PQCLEAN_KYBER512_AARCH64__asm_sub_reduce(c, a); // forwards both pointers unchanged; operand order of the subtraction is not visible here -- TODO confirm against the routine's definition
 }
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/neon_polyvec.c b/src/kem/kyber/pqclean_kyber512_aarch64/neon_polyvec.c
index 1af48ea..c05f59d 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/neon_polyvec.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/neon_polyvec.c
@@ -1,10 +1,45 @@
-#include "NTT_params.h"
-#include "ntt.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <arm_neon.h>
 #include "params.h"
+#include "reduce.h"
+#include "ntt.h"
 #include "poly.h"
 #include "polyvec.h"
-#include "reduce.h"
-#include <arm_neon.h>
+
+#include "NTT_params.h"
 
 #define _V (((1U << 26) + KYBER_Q / 2) / KYBER_Q)
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/neon_symmetric-shake.c b/src/kem/kyber/pqclean_kyber512_aarch64/neon_symmetric-shake.c
index 6515250..8aced5e 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/neon_symmetric-shake.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/neon_symmetric-shake.c
@@ -1,8 +1,42 @@
-#include "fips202x2.h"
-#include "params.h"
-#include "symmetric.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <stddef.h>
 #include <stdint.h>
+#include "params.h"
+#include "fips202x2.h"
+#include "symmetric.h"
 
 /*************************************************
 * Name:        kyber_shake128_absorb
@@ -19,22 +53,23 @@
 void neon_kyber_shake128_absorb(keccakx2_state *state,
                                 const uint8_t seed[KYBER_SYMBYTES],
                                 uint8_t x1, uint8_t x2,
-                                uint8_t y1, uint8_t y2) {
-    unsigned int i;
-    uint8_t extseed1[KYBER_SYMBYTES + 2 + 14];
-    uint8_t extseed2[KYBER_SYMBYTES + 2 + 14];
+                                uint8_t y1, uint8_t y2)
+{
+  unsigned int i;
+  uint8_t extseed1[KYBER_SYMBYTES+2]; // seed bytes followed by x1, y1
+  uint8_t extseed2[KYBER_SYMBYTES+2]; // seed bytes followed by x2, y2
 
-    for (i = 0; i < KYBER_SYMBYTES; i++) {
-        extseed1[i] = seed[i];
-        extseed2[i] = seed[i];
-    }
-    extseed1[KYBER_SYMBYTES  ] = x1;
-    extseed1[KYBER_SYMBYTES + 1] = y1;
+  for(i=0;i<KYBER_SYMBYTES;i++){ // both buffers start with an identical copy of the seed
+    extseed1[i] = seed[i];
+    extseed2[i] = seed[i];
+  }
+  extseed1[KYBER_SYMBYTES  ] = x1; // first buffer is tagged with the pair (x1, y1)
+  extseed1[KYBER_SYMBYTES+1] = y1;
 
-    extseed2[KYBER_SYMBYTES  ] = x2;
-    extseed2[KYBER_SYMBYTES + 1] = y2;
+  extseed2[KYBER_SYMBYTES  ] = x2; // second buffer is tagged with the pair (x2, y2)
+  extseed2[KYBER_SYMBYTES+1] = y2;
 
-    shake128x2_absorb(state, extseed1, extseed2, KYBER_SYMBYTES + 2);
+  shake128x2_absorb(state, extseed1, extseed2, sizeof(extseed1)); // sizeof == KYBER_SYMBYTES+2, same length as before; NOTE(review): the replaced code kept 14 spare bytes after each buffer -- confirm shake128x2_absorb never reads past the given length
 }
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/ntt.c b/src/kem/kyber/pqclean_kyber512_aarch64/ntt.c
index 1216c2c..7f28d9a 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/ntt.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/ntt.c
@@ -1,8 +1,35 @@
-#include "NTT_params.h"
-#include "ntt.h"
-#include "params.h"
-#include "reduce.h"
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <arm_neon.h>
+#include "params.h"
+#include "ntt.h"
+#include "reduce.h"
+#include "NTT_params.h"
 
 /*************************************************
 * Name:        ntt
@@ -27,7 +54,8 @@
 * Arguments:   - int16_t r[256] in {-(q-1)/2,...,(q-1)/2}
 *              pointer to input/output vector of elements of Zq
 **************************************************/
-void invntt(int16_t r[256]) {
-    iNTT(r);
+void invntt(int16_t r[256])
+{
+  iNTT(r); /* iNTT() is the macro from ntt.h: it calls the _asm_intt_SIMD_bot and then the _asm_intt_SIMD_top routine on r */
 
 }
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/ntt.h b/src/kem/kyber/pqclean_kyber512_aarch64/ntt.h
index 0f7574d..1b70535 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/ntt.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/ntt.h
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_H
 #define NTT_H
 #include "params.h"
@@ -9,53 +36,55 @@
 
 #define ntt KYBER_NAMESPACE(ntt)
 void ntt(int16_t r[256]);
-
 #define invntt KYBER_NAMESPACE(invntt)
 void invntt(int16_t r[256]);
 
 
-extern void PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *);
-extern void PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top(int16_t*, const int16_t*, const int16_t*); /* first call made by NTT(): (data, twiddle table, constants) */
+extern void PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot(int16_t*, const int16_t*, const int16_t*); /* second call made by NTT(), same three arguments */
 
-extern void PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot(int16_t *, const int16_t *, const int16_t *);
-extern void PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top(int16_t *, const int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot(int16_t*, const int16_t*, const int16_t*); /* first call made by iNTT(): (data, inverse twiddle table, constants) */
+extern void PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top(int16_t*, const int16_t*, const int16_t*); /* second call made by iNTT(), same three arguments */
 
-extern void PQCLEAN_KYBER512_AARCH64_asm_point_mul_extended(int16_t *, const int16_t *, const int16_t *, const int16_t *);
-extern void PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *);
-extern void PQCLEAN_KYBER512_AARCH64_asm_asymmetric_mul_montgomery(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *);
+extern void PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended(int16_t*, const int16_t*, const int16_t*, const int16_t*); /* only non-const pointer is the first argument; NOTE(review): no caller is visible in this header */
+extern void PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul(const int16_t*, const int16_t*, const int16_t*, const int16_t*, int16_t*); /* only non-const pointer is the last argument */
+extern void PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery(const int16_t*, const int16_t*, const int16_t*, const int16_t*, int16_t*); /* same signature as _asm_asymmetric_mul */
 
 static const int16_t asymmetric_const[16] = {
-    Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime
+Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime /* 6 initializers for 16 slots: the remaining 10 entries are implicitly 0 */
 };
 
 #define NTT(in) { \
-        PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
-        PQCLEAN_KYBER512_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); /* forward transform: "top" routine runs first */ \
+        PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); /* then "bot", with the same table and constants */ \
     }
 
 #define iNTT(in) { \
-        PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
-        PQCLEAN_KYBER512_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
+	PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); /* inverse transform: order is reversed, "bot" runs first */ \
+	PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); /* then "top", with the inverse table */ \
     }
 
 static const int16_t constants[16] = {
-    Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1,
-    invNQ1_R3modQ1_prime_half,
-    invNQ1_R3modQ1_doubleprime,
-    invNQ1_final_R3modQ1_prime_half,
-    invNQ1_final_R3modQ1_doubleprime
+Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, /* this array is the last argument of every call in NTT() and iNTT() */
+invNQ1_R3modQ1_prime_half,
+invNQ1_R3modQ1_doubleprime,
+invNQ1_final_R3modQ1_prime_half,
+invNQ1_final_R3modQ1_doubleprime /* 9 initializers for 16 slots: the remaining 7 entries are implicitly 0 */
 };
 
-static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = {
-    0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, 
-7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0
+static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] =
+{
+0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, 
-7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0
 };
 
-static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = {
-    167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175
+static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] =
+{
+167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175
 };
 
-static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = {
-    0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 
10355, 1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0
+static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] =
+{
+0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 10355, 
1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0
 };
 
 #endif
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/params.h b/src/kem/kyber/pqclean_kyber512_aarch64/params.h
index 233e835..541c14b 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/params.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/params.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef PARAMS_H
 #define PARAMS_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/poly.c b/src/kem/kyber/pqclean_kyber512_aarch64/poly.c
index 6d1ecdd..dffc655 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/poly.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/poly.c
@@ -1,10 +1,43 @@
-#include "cbd.h"
-#include "ntt.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/blob/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <arm_neon.h>
 #include "params.h"
 #include "poly.h"
+#include "ntt.h"
 #include "reduce.h"
+#include "cbd.h"
 #include "symmetric.h"
-#include <arm_neon.h>
 
 /*************************************************
 * Name:        poly_compress
@@ -139,6 +172,9 @@
     unsigned int i, j;
     int16_t mask;
 
+    #if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
+#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
+    #endif
 
     for (i = 0; i < KYBER_N / 8; i++) {
         for (j = 0; j < 8; j++) {
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/poly.h b/src/kem/kyber/pqclean_kyber512_aarch64/poly.h
index 51657a6..4caf07d 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/poly.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/poly.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef POLY_H
 #define POLY_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/polyvec.c b/src/kem/kyber/pqclean_kyber512_aarch64/polyvec.c
index 58dc92a..d400348 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/polyvec.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/polyvec.c
@@ -1,7 +1,14 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
+#include <stdint.h>
 #include "params.h"
 #include "poly.h"
 #include "polyvec.h"
-#include <stdint.h>
 
 /*************************************************
 * Name:        polyvec_compress
@@ -15,6 +22,31 @@
 void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]) {
     unsigned int i, j, k;
 
+    #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+    uint16_t t[8];
+    for (i = 0; i < KYBER_K; i++) { /* 11-bit compression: every group of 8 coefficients becomes 11 output bytes */
+        for (j = 0; j < KYBER_N / 8; j++) {
+            for (k = 0; k < 8; k++) {
+                t[k]  = a[i][8 * j + k];
+                t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; /* add KYBER_Q when the coefficient is negative (sign mask from arithmetic shift) */
+                t[k]  = (uint16_t)((((((uint64_t)t[k] << 11) + KYBER_Q / 2) * 645084) >> 31) & 0x7ff); /* round(t*2^11/q) mod 2^11 without '/': 645084 = ceil(2^31/3329), exact for 0 <= t < 3329; avoids the variable-time division on message-dependent data (KyberSlash) */
+            }
+
+            r[ 0] = (t[0] >>  0); /* pack 8 x 11-bit values, least significant bits first */
+            r[ 1] = (t[0] >>  8) | (t[1] << 3);
+            r[ 2] = (t[1] >>  5) | (t[2] << 6);
+            r[ 3] = (t[2] >>  2);
+            r[ 4] = (t[2] >> 10) | (t[3] << 1);
+            r[ 5] = (t[3] >>  7) | (t[4] << 4);
+            r[ 6] = (t[4] >>  4) | (t[5] << 7);
+            r[ 7] = (t[5] >>  1);
+            r[ 8] = (t[5] >>  9) | (t[6] << 2);
+            r[ 9] = (t[6] >>  6) | (t[7] << 5);
+            r[10] = (t[7] >>  3);
+            r += 11; /* advance the output cursor past the 11 bytes just written */
+        }
+    }
+    #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
     uint16_t t[4];
     for (i = 0; i < KYBER_K; i++) {
         for (j = 0; j < KYBER_N / 4; j++) {
@@ -32,6 +64,9 @@
             r += 5;
         }
     }
+    #else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+    #endif
 }
 
 /*************************************************
@@ -47,6 +82,26 @@
 void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
     unsigned int i, j, k;
 
+    #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+    uint16_t t[8];
+    for (i = 0; i < KYBER_K; i++) {
+        for (j = 0; j < KYBER_N / 8; j++) {
+            t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
+            t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
+            t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
+            t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
+            t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
+            t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
+            t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
+            t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
+            a += 11;
+
+            for (k = 0; k < 8; k++) {
+                r[i][8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11;
+            }
+        }
+    }
+    #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
     uint16_t t[4];
     for (i = 0; i < KYBER_K; i++) {
         for (j = 0; j < KYBER_N / 4; j++) {
@@ -61,6 +116,9 @@
             }
         }
     }
+    #else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+    #endif
 }
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/polyvec.h b/src/kem/kyber/pqclean_kyber512_aarch64/polyvec.h
index 560f267..04a2c5c 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/polyvec.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/polyvec.h
@@ -1,3 +1,37 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef POLYVEC_H
 #define POLYVEC_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/reduce.c b/src/kem/kyber/pqclean_kyber512_aarch64/reduce.c
index ec3328c..7143512 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/reduce.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/reduce.c
@@ -1,6 +1,13 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
+#include <stdint.h>
 #include "params.h"
 #include "reduce.h"
-#include <stdint.h>
 
 /*************************************************
 * Name:        montgomery_reduce
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/reduce.h b/src/kem/kyber/pqclean_kyber512_aarch64/reduce.h
index 9a35638..c443afb 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/reduce.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/reduce.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef REDUCE_H
 #define REDUCE_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/rejsample.c b/src/kem/kyber/pqclean_kyber512_aarch64/rejsample.c
index 05a1990..d694ab8 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/rejsample.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/rejsample.c
@@ -1,7 +1,15 @@
-#include "params.h"
-#include "rejsample.h"
-#include "symmetric.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #include <arm_neon.h>
+#include "params.h"
+#include "symmetric.h"
+#include "rejsample.h"
 
 // Define NEON operation
 // Load 8x16
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/rejsample.h b/src/kem/kyber/pqclean_kyber512_aarch64/rejsample.h
index 8a94a4d..540c3a0 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/rejsample.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/rejsample.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef REJSAMPLE_H
 #define REJSAMPLE_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/symmetric-shake.c b/src/kem/kyber/pqclean_kyber512_aarch64/symmetric-shake.c
index 9311d5d..e7e7e87 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/symmetric-shake.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/symmetric-shake.c
@@ -1,9 +1,16 @@
-#include "fips202.h"
-#include "params.h"
-#include "symmetric.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
+#include "params.h"
+#include "symmetric.h"
+#include "fips202.h"
 
 /*************************************************
 * Name:        kyber_shake128_absorb
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/symmetric.h b/src/kem/kyber/pqclean_kyber512_aarch64/symmetric.h
index ac0a783..12f6a5c 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/symmetric.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/symmetric.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef SYMMETRIC_H
 #define SYMMETRIC_H
 
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/verify.c b/src/kem/kyber/pqclean_kyber512_aarch64/verify.c
index 5d53c66..ca30408 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/verify.c
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/verify.c
@@ -1,6 +1,13 @@
-#include "verify.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #include <stddef.h>
 #include <stdint.h>
+#include "verify.h"
 
 /*************************************************
 * Name:        verify
diff --git a/src/kem/kyber/pqclean_kyber512_aarch64/verify.h b/src/kem/kyber/pqclean_kyber512_aarch64/verify.h
index 521f861..18ae986 100644
--- a/src/kem/kyber/pqclean_kyber512_aarch64/verify.h
+++ b/src/kem/kyber/pqclean_kyber512_aarch64/verify.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef VERIFY_H
 #define VERIFY_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/Makefile b/src/kem/kyber/pqclean_kyber768_aarch64/Makefile
deleted file mode 100644
index 476ef45..0000000
--- a/src/kem/kyber/pqclean_kyber768_aarch64/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-# This Makefile can be used with GNU Make or BSD Make
-
-LIB=libkyber768_aarch64.a
-HEADERS=api.h cbd.h fips202x2.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h 
-OBJECTS=cbd.o fips202x2.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o 
-
-CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)
-
-all: $(LIB)
-
-%.o: %.c $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-%.o: %.S $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-$(LIB): $(OBJECTS)
-	$(AR) -r $@ $(OBJECTS)
-
-clean:
-	$(RM) $(OBJECTS)
-	$(RM) $(LIB)
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/NTT_params.h b/src/kem/kyber/pqclean_kyber768_aarch64/NTT_params.h
index 49edeb9..77dae1f 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/NTT_params.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/NTT_params.h
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_PARAMS_H
 #define NTT_PARAMS_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/__asm_NTT.S b/src/kem/kyber/pqclean_kyber768_aarch64/__asm_NTT.S
index bb2253e..eb76616 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/__asm_NTT.S
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/__asm_NTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top
-.global _PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top, %function
-#endif
-PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top:
-_PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top:
+.global PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top
+.global _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top
+PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top:
+_PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top:
 
     push_all
     Q         .req w20
@@ -171,13 +194,10 @@
 
 
 .align 2
-.global PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot
-.global _PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot, %function
-#endif
-PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot:
-_PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot:
+.global PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot
+.global _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot
+PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot:
+_PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot:
 
     push_all
     Q         .req w20
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/__asm_base_mul.S b/src/kem/kyber/pqclean_kyber768_aarch64/__asm_base_mul.S
index 2bbb228..cc4636a 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/__asm_base_mul.S
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/__asm_base_mul.S
@@ -1,16 +1,39 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 #include "params.h"
 
 .align 2
-.global PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended
-.global _PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended
-#if !defined(__clang__) && !defined(old_gas_syntax)
-  .type PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended, %function
-#endif
-PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended:
-_PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended:
+.global PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended
+.global _PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended
+PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended:
+_PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended:
 
     push_all
     Q         .req w20
@@ -71,13 +94,10 @@
 
 
 .align 2
-.global PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul
-.global _PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul, %function
-#endif
-PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul:
-_PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul:
+.global PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul
+.global _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul
+PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul:
+_PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul:
 
     push_all
     Q         .req w28
@@ -226,13 +246,10 @@
 
 
 .align 2
-.global PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery
-.global _PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery, %function
-#endif
-PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery:
-_PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery:
+.global PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery
+.global _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery
+PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery:
+_PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery:
 
     push_all
     Q         .req w28
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/__asm_iNTT.S b/src/kem/kyber/pqclean_kyber768_aarch64/__asm_iNTT.S
index cce9aa7..7ddb592 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/__asm_iNTT.S
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/__asm_iNTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot
-.global _PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot, %function
-#endif
-PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot:
-_PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot:
+.global PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot
+.global _PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot
+PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot:
+_PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot:
 
     push_all
     Q         .req w20
@@ -88,13 +111,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top
-.global _PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top, %function
-#endif
-PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top:
-_PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top:
+.global PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top
+.global _PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top
+PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top:
+_PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top:
 
     push_all
     Q         .req w20
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/__asm_poly.S b/src/kem/kyber/pqclean_kyber768_aarch64/__asm_poly.S
index 9d7816c..16d5c2d 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/__asm_poly.S
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/__asm_poly.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_KYBER768_AARCH64_asm_add_reduce
-.global _PQCLEAN_KYBER768_AARCH64_asm_add_reduce
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER768_AARCH64_asm_add_reduce, %function
-#endif
-PQCLEAN_KYBER768_AARCH64_asm_add_reduce:
-_PQCLEAN_KYBER768_AARCH64_asm_add_reduce:
+.global PQCLEAN_KYBER768_AARCH64__asm_add_reduce
+.global _PQCLEAN_KYBER768_AARCH64__asm_add_reduce
+PQCLEAN_KYBER768_AARCH64__asm_add_reduce:
+_PQCLEAN_KYBER768_AARCH64__asm_add_reduce:
 
     mov w4, #3329
     mov w5, #25519
@@ -66,13 +89,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_KYBER768_AARCH64_asm_sub_reduce
-.global _PQCLEAN_KYBER768_AARCH64_asm_sub_reduce
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER768_AARCH64_asm_sub_reduce, %function
-#endif
-PQCLEAN_KYBER768_AARCH64_asm_sub_reduce:
-_PQCLEAN_KYBER768_AARCH64_asm_sub_reduce:
+.global PQCLEAN_KYBER768_AARCH64__asm_sub_reduce
+.global _PQCLEAN_KYBER768_AARCH64__asm_sub_reduce
+PQCLEAN_KYBER768_AARCH64__asm_sub_reduce:
+_PQCLEAN_KYBER768_AARCH64__asm_sub_reduce:
 
     mov w4, #3329
     mov w5, #25519
@@ -130,13 +150,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce
-.global _PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce
-#if !defined(__clang__) && !defined(old_gas_syntax)
-.type PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce, %function
-#endif
-PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce:
-_PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce:
+.global PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce
+.global _PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce
+PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce:
+_PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce:
 
     mov w4, #3329
     mov w5, #25519
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/api.h b/src/kem/kyber/pqclean_kyber768_aarch64/api.h
index 5831019..e17fadc 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/api.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/api.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
+
 #ifndef PQCLEAN_KYBER768_AARCH64_API_H
 #define PQCLEAN_KYBER768_AARCH64_API_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/cbd.c b/src/kem/kyber/pqclean_kyber768_aarch64/cbd.c
index f6d9bf3..a1c98b6 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/cbd.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/cbd.c
@@ -1,7 +1,15 @@
-#include "cbd.h"
-#include "params.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #include <arm_neon.h>
 #include <stdint.h>
+#include "params.h"
+#include "cbd.h"
 
 #define vload2(c, ptr) c = vld2q_u8(ptr);
 
@@ -23,7 +31,8 @@
 #define vsublh8(c, a, b) c = (int16x8_t)vsubl_high_u8(a, b);
 
 static
-void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) {
+void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4])
+{
     uint8x16x2_t t, d;      // 4
     uint8x16x2_t a, b;      // 4
     int16x8x4_t res1, res2; // 4
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/cbd.h b/src/kem/kyber/pqclean_kyber768_aarch64/cbd.h
index e1d2fb5..8a1cee5 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/cbd.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/cbd.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef CBD_H
 #define CBD_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/feat.S b/src/kem/kyber/pqclean_kyber768_aarch64/feat.S
new file mode 100644
index 0000000..ce72974
--- /dev/null
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/feat.S
@@ -0,0 +1,168 @@
+
+/*
+MIT License
+
+Copyright (c) 2020 Bas Westerbaan
+Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3)
+// One Keccak-f[1600] round on the state held in v0-v24; x1 points at the round constant and is advanced by 8.
+.macro round
+    // Execute theta, but without xoring into the state yet.
+    // Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i].
+    eor3 v25.16b, v0.16b, v5.16b, v10.16b
+    eor3 v26.16b, v1.16b, v6.16b, v11.16b
+    eor3 v27.16b, v2.16b, v7.16b, v12.16b
+    eor3 v28.16b, v3.16b, v8.16b, v13.16b
+    eor3 v29.16b, v4.16b, v9.16b, v14.16b
+
+    eor3 v25.16b, v25.16b, v15.16b, v20.16b
+    eor3 v26.16b, v26.16b, v16.16b, v21.16b
+    eor3 v27.16b, v27.16b, v17.16b, v22.16b
+    eor3 v28.16b, v28.16b, v18.16b, v23.16b
+    eor3 v29.16b, v29.16b, v19.16b, v24.16b
+
+    rax1 v30.2d, v29.2d, v26.2d // d[0] = rotl(p[1], 1) ^ p[4]
+    rax1 v29.2d, v27.2d, v29.2d // d[3] = rotl(p[4], 1) ^ p[2]
+    rax1 v27.2d, v25.2d, v27.2d // d[1] = rotl(p[2], 1) ^ p[0]
+    rax1 v25.2d, v28.2d, v25.2d // d[4] = rotl(p[0], 1) ^ p[3]
+    rax1 v28.2d, v26.2d, v28.2d // d[2] = rotl(p[3], 1) ^ p[1]
+
+    // Xor parities from step theta into the state at the same time
+    // as executing rho and pi.
+    eor v0.16b, v0.16b, v30.16b
+    mov v31.16b, v1.16b
+    xar v1.2d,  v6.2d,  v27.2d, #20
+    xar v6.2d,  v9.2d,  v25.2d, #44
+    xar v9.2d,  v22.2d, v28.2d, #3
+    xar v22.2d, v14.2d, v25.2d, #25
+    xar v14.2d, v20.2d, v30.2d, #46
+    xar v20.2d, v2.2d,  v28.2d, #2
+    xar v2.2d,  v12.2d, v28.2d, #21
+    xar v12.2d, v13.2d, v29.2d, #39
+    xar v13.2d, v19.2d, v25.2d, #56
+    xar v19.2d, v23.2d, v29.2d, #8
+    xar v23.2d, v15.2d, v30.2d, #23
+    xar v15.2d, v4.2d,  v25.2d, #37
+    xar v4.2d,  v24.2d, v25.2d, #50
+    xar v24.2d, v21.2d, v27.2d, #62
+    xar v21.2d, v8.2d,  v29.2d, #9
+    xar v8.2d,  v16.2d, v27.2d, #19
+    xar v16.2d, v5.2d,  v30.2d, #28
+    xar v5.2d,  v3.2d,  v29.2d, #36
+    xar v3.2d,  v18.2d, v29.2d, #43
+    xar v18.2d, v17.2d, v28.2d, #49
+    xar v17.2d, v11.2d, v27.2d, #54
+    xar v11.2d, v7.2d,  v28.2d, #58
+    xar v7.2d,  v10.2d, v30.2d, #61
+    xar v10.2d, v31.2d, v27.2d, #63
+
+    // Chi
+    bcax v25.16b, v0.16b,  v2.16b,  v1.16b
+    bcax v26.16b, v1.16b,  v3.16b,  v2.16b
+    bcax v2.16b,  v2.16b,  v4.16b,  v3.16b
+    bcax v3.16b,  v3.16b,  v0.16b,  v4.16b
+    bcax v4.16b,  v4.16b,  v1.16b,  v0.16b
+    mov v0.16b, v25.16b
+    mov v1.16b, v26.16b
+
+    bcax v25.16b, v5.16b,  v7.16b,  v6.16b
+    bcax v26.16b, v6.16b,  v8.16b,  v7.16b
+    bcax v7.16b,  v7.16b,  v9.16b,  v8.16b
+    bcax v8.16b,  v8.16b,  v5.16b,  v9.16b
+    bcax v9.16b,  v9.16b,  v6.16b,  v5.16b
+    mov v5.16b, v25.16b
+    mov v6.16b, v26.16b
+
+    bcax v25.16b, v10.16b,  v12.16b,  v11.16b
+    bcax v26.16b, v11.16b,  v13.16b,  v12.16b
+    bcax v12.16b, v12.16b,  v14.16b,  v13.16b
+    bcax v13.16b, v13.16b,  v10.16b,  v14.16b
+    bcax v14.16b, v14.16b,  v11.16b,  v10.16b
+    mov v10.16b, v25.16b
+    mov v11.16b, v26.16b
+
+    bcax v25.16b, v15.16b,  v17.16b,  v16.16b
+    bcax v26.16b, v16.16b,  v18.16b,  v17.16b
+    bcax v17.16b, v17.16b,  v19.16b,  v18.16b
+    bcax v18.16b, v18.16b,  v15.16b,  v19.16b
+    bcax v19.16b, v19.16b,  v16.16b,  v15.16b
+    mov v15.16b, v25.16b
+    mov v16.16b, v26.16b
+
+    bcax v25.16b, v20.16b,  v22.16b,  v21.16b
+    bcax v26.16b, v21.16b,  v23.16b,  v22.16b
+    bcax v22.16b, v22.16b,  v24.16b,  v23.16b
+    bcax v23.16b, v23.16b,  v20.16b,  v24.16b
+    bcax v24.16b, v24.16b,  v21.16b,  v20.16b
+    mov v20.16b, v25.16b
+    mov v21.16b, v26.16b
+
+    // iota
+    ld1r {v25.2d}, [x1], #8
+    eor v0.16b, v0.16b, v25.16b
+.endm
+// f1600x2(x0 = state, 25 x 128-bit entries; x1 = round constants): permutes the state in place with 24 rounds.
+.align 4
+.global PQCLEAN_KYBER768_AARCH64_f1600x2
+.global _PQCLEAN_KYBER768_AARCH64_f1600x2
+PQCLEAN_KYBER768_AARCH64_f1600x2:
+_PQCLEAN_KYBER768_AARCH64_f1600x2:
+    stp d8,  d9,  [sp,#-16]!    // v8-v15 are overwritten below, so
+    stp d10, d11, [sp,#-16]!    // preserve d8-d15 on the stack and
+    stp d12, d13, [sp,#-16]!    // restore them before returning
+    stp d14, d15, [sp,#-16]!
+
+    mov x2, x0                  // keep the state base; x0 is post-incremented by the loads
+    mov x3, #24                 // round counter
+
+    ld1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    ld1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    ld1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    ld1 {v24.2d}, [x0]
+
+loop:
+    round
+
+    subs x3, x3, #1
+    cbnz x3, loop
+
+    mov x0, x2                  // rewind to the state base for the store-back
+    st1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    st1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    st1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    st1 {v24.2d}, [x0]
+
+    ldp d14, d15, [sp], #16
+    ldp d12, d13, [sp], #16
+    ldp d10, d11, [sp], #16
+    ldp d8,  d9,  [sp], #16
+
+    ret lr
+
+#endif
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/fips202x2.c b/src/kem/kyber/pqclean_kyber768_aarch64/fips202x2.c
index 3924900..1c6b603 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/fips202x2.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/fips202x2.c
@@ -1,6 +1,40 @@
-#include "fips202x2.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <arm_neon.h>
 #include <stddef.h>
+#include "fips202x2.h"
 
 
 #define NROUNDS 24
@@ -67,275 +101,282 @@
 *
 * Arguments:   - uint64_t *state: pointer to input/output Keccak state
 **************************************************/
+extern void PQCLEAN_KYBER768_AARCH64_f1600x2(v128*, const uint64_t*);
 static inline
-void KeccakF1600_StatePermutex2(v128 state[25]) {
-    v128 Aba, Abe, Abi, Abo, Abu;
-    v128 Aga, Age, Agi, Ago, Agu;
-    v128 Aka, Ake, Aki, Ako, Aku;
-    v128 Ama, Ame, Ami, Amo, Amu;
-    v128 Asa, Ase, Asi, Aso, Asu;
-    v128 BCa, BCe, BCi, BCo, BCu; // tmp
-    v128 Da, De, Di, Do, Du;      // D
-    v128 Eba, Ebe, Ebi, Ebo, Ebu;
-    v128 Ega, Ege, Egi, Ego, Egu;
-    v128 Eka, Eke, Eki, Eko, Eku;
-    v128 Ema, Eme, Emi, Emo, Emu;
-    v128 Esa, Ese, Esi, Eso, Esu;
+void KeccakF1600_StatePermutex2(v128 state[25])
+{
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* target advertises the SHA-3 (or, on Apple, crypto) extension: call the external f1600x2 routine instead of the C fallback below */
+  PQCLEAN_KYBER768_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
+#else
+  v128 Aba, Abe, Abi, Abo, Abu;
+  v128 Aga, Age, Agi, Ago, Agu;
+  v128 Aka, Ake, Aki, Ako, Aku;
+  v128 Ama, Ame, Ami, Amo, Amu;
+  v128 Asa, Ase, Asi, Aso, Asu;
+  v128 BCa, BCe, BCi, BCo, BCu; // tmp
+  v128 Da, De, Di, Do, Du;      // D
+  v128 Eba, Ebe, Ebi, Ebo, Ebu;
+  v128 Ega, Ege, Egi, Ego, Egu;
+  v128 Eka, Eke, Eki, Eko, Eku;
+  v128 Ema, Eme, Emi, Emo, Emu;
+  v128 Esa, Ese, Esi, Eso, Esu;
 
-    //copyFromState(A, state)
-    Aba = state[0];
-    Abe = state[1];
-    Abi = state[2];
-    Abo = state[3];
-    Abu = state[4];
-    Aga = state[5];
-    Age = state[6];
-    Agi = state[7];
-    Ago = state[8];
-    Agu = state[9];
-    Aka = state[10];
-    Ake = state[11];
-    Aki = state[12];
-    Ako = state[13];
-    Aku = state[14];
-    Ama = state[15];
-    Ame = state[16];
-    Ami = state[17];
-    Amo = state[18];
-    Amu = state[19];
-    Asa = state[20];
-    Ase = state[21];
-    Asi = state[22];
-    Aso = state[23];
-    Asu = state[24];
+  //copyFromState(A, state)
+  Aba = state[0];
+  Abe = state[1];
+  Abi = state[2];
+  Abo = state[3];
+  Abu = state[4];
+  Aga = state[5];
+  Age = state[6];
+  Agi = state[7];
+  Ago = state[8];
+  Agu = state[9];
+  Aka = state[10];
+  Ake = state[11];
+  Aki = state[12];
+  Ako = state[13];
+  Aku = state[14];
+  Ama = state[15];
+  Ame = state[16];
+  Ami = state[17];
+  Amo = state[18];
+  Amu = state[19];
+  Asa = state[20];
+  Ase = state[21];
+  Asi = state[22];
+  Aso = state[23];
+  Asu = state[24];
 
-    for (int round = 0; round < NROUNDS; round += 2) {
-        //    prepareTheta
-        vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
-        vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
-        vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
-        vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
-        vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
+  for (int round = 0; round < NROUNDS; round += 2)
+  {
+    //    prepareTheta
+    vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
+    vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
+    vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
+    vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
+    vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Aba, Aba, Da);
-        vxor(Age, Age, De);
-        vROL(BCe, Age, 44);
-        vxor(Aki, Aki, Di);
-        vROL(BCi, Aki, 43);
-        vxor(Amo, Amo, Do);
-        vROL(BCo, Amo, 21);
-        vxor(Asu, Asu, Du);
-        vROL(BCu, Asu, 14);
-        vXNA(Eba, Aba, BCe, BCi);
-        vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
-        vXNA(Ebe, BCe, BCi, BCo);
-        vXNA(Ebi, BCi, BCo, BCu);
-        vXNA(Ebo, BCo, BCu, Aba);
-        vXNA(Ebu, BCu, Aba, BCe);
+    vxor(Aba, Aba, Da);
+    vxor(Age, Age, De);
+    vROL(BCe, Age, 44);
+    vxor(Aki, Aki, Di);
+    vROL(BCi, Aki, 43);
+    vxor(Amo, Amo, Do);
+    vROL(BCo, Amo, 21);
+    vxor(Asu, Asu, Du);
+    vROL(BCu, Asu, 14);
+    vXNA(Eba, Aba, BCe, BCi);
+    vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
+    vXNA(Ebe, BCe, BCi, BCo);
+    vXNA(Ebi, BCi, BCo, BCu);
+    vXNA(Ebo, BCo, BCu, Aba);
+    vXNA(Ebu, BCu, Aba, BCe);
 
-        vxor(Abo, Abo, Do);
-        vROL(BCa, Abo, 28);
-        vxor(Agu, Agu, Du);
-        vROL(BCe, Agu, 20);
-        vxor(Aka, Aka, Da);
-        vROL(BCi, Aka, 3);
-        vxor(Ame, Ame, De);
-        vROL(BCo, Ame, 45);
-        vxor(Asi, Asi, Di);
-        vROL(BCu, Asi, 61);
-        vXNA(Ega, BCa, BCe, BCi);
-        vXNA(Ege, BCe, BCi, BCo);
-        vXNA(Egi, BCi, BCo, BCu);
-        vXNA(Ego, BCo, BCu, BCa);
-        vXNA(Egu, BCu, BCa, BCe);
+    vxor(Abo, Abo, Do);
+    vROL(BCa, Abo, 28);
+    vxor(Agu, Agu, Du);
+    vROL(BCe, Agu, 20);
+    vxor(Aka, Aka, Da);
+    vROL(BCi, Aka, 3);
+    vxor(Ame, Ame, De);
+    vROL(BCo, Ame, 45);
+    vxor(Asi, Asi, Di);
+    vROL(BCu, Asi, 61);
+    vXNA(Ega, BCa, BCe, BCi);
+    vXNA(Ege, BCe, BCi, BCo);
+    vXNA(Egi, BCi, BCo, BCu);
+    vXNA(Ego, BCo, BCu, BCa);
+    vXNA(Egu, BCu, BCa, BCe);
 
-        vxor(Abe, Abe, De);
-        vROL(BCa, Abe, 1);
-        vxor(Agi, Agi, Di);
-        vROL(BCe, Agi, 6);
-        vxor(Ako, Ako, Do);
-        vROL(BCi, Ako, 25);
-        vxor(Amu, Amu, Du);
-        vROL(BCo, Amu, 8);
-        vxor(Asa, Asa, Da);
-        vROL(BCu, Asa, 18);
-        vXNA(Eka, BCa, BCe, BCi);
-        vXNA(Eke, BCe, BCi, BCo);
-        vXNA(Eki, BCi, BCo, BCu);
-        vXNA(Eko, BCo, BCu, BCa);
-        vXNA(Eku, BCu, BCa, BCe);
+    vxor(Abe, Abe, De);
+    vROL(BCa, Abe, 1);
+    vxor(Agi, Agi, Di);
+    vROL(BCe, Agi, 6);
+    vxor(Ako, Ako, Do);
+    vROL(BCi, Ako, 25);
+    vxor(Amu, Amu, Du);
+    vROL(BCo, Amu, 8);
+    vxor(Asa, Asa, Da);
+    vROL(BCu, Asa, 18);
+    vXNA(Eka, BCa, BCe, BCi);
+    vXNA(Eke, BCe, BCi, BCo);
+    vXNA(Eki, BCi, BCo, BCu);
+    vXNA(Eko, BCo, BCu, BCa);
+    vXNA(Eku, BCu, BCa, BCe);
 
-        vxor(Abu, Abu, Du);
-        vROL(BCa, Abu, 27);
-        vxor(Aga, Aga, Da);
-        vROL(BCe, Aga, 36);
-        vxor(Ake, Ake, De);
-        vROL(BCi, Ake, 10);
-        vxor(Ami, Ami, Di);
-        vROL(BCo, Ami, 15);
-        vxor(Aso, Aso, Do);
-        vROL(BCu, Aso, 56);
-        vXNA(Ema, BCa, BCe, BCi);
-        vXNA(Eme, BCe, BCi, BCo);
-        vXNA(Emi, BCi, BCo, BCu);
-        vXNA(Emo, BCo, BCu, BCa);
-        vXNA(Emu, BCu, BCa, BCe);
+    vxor(Abu, Abu, Du);
+    vROL(BCa, Abu, 27);
+    vxor(Aga, Aga, Da);
+    vROL(BCe, Aga, 36);
+    vxor(Ake, Ake, De);
+    vROL(BCi, Ake, 10);
+    vxor(Ami, Ami, Di);
+    vROL(BCo, Ami, 15);
+    vxor(Aso, Aso, Do);
+    vROL(BCu, Aso, 56);
+    vXNA(Ema, BCa, BCe, BCi);
+    vXNA(Eme, BCe, BCi, BCo);
+    vXNA(Emi, BCi, BCo, BCu);
+    vXNA(Emo, BCo, BCu, BCa);
+    vXNA(Emu, BCu, BCa, BCe);
 
-        vxor(Abi, Abi, Di);
-        vROL(BCa, Abi, 62);
-        vxor(Ago, Ago, Do);
-        vROL(BCe, Ago, 55);
-        vxor(Aku, Aku, Du);
-        vROL(BCi, Aku, 39);
-        vxor(Ama, Ama, Da);
-        vROL(BCo, Ama, 41);
-        vxor(Ase, Ase, De);
-        vROL(BCu, Ase, 2);
-        vXNA(Esa, BCa, BCe, BCi);
-        vXNA(Ese, BCe, BCi, BCo);
-        vXNA(Esi, BCi, BCo, BCu);
-        vXNA(Eso, BCo, BCu, BCa);
-        vXNA(Esu, BCu, BCa, BCe);
+    vxor(Abi, Abi, Di);
+    vROL(BCa, Abi, 62);
+    vxor(Ago, Ago, Do);
+    vROL(BCe, Ago, 55);
+    vxor(Aku, Aku, Du);
+    vROL(BCi, Aku, 39);
+    vxor(Ama, Ama, Da);
+    vROL(BCo, Ama, 41);
+    vxor(Ase, Ase, De);
+    vROL(BCu, Ase, 2);
+    vXNA(Esa, BCa, BCe, BCi);
+    vXNA(Ese, BCe, BCi, BCo);
+    vXNA(Esi, BCi, BCo, BCu);
+    vXNA(Eso, BCo, BCu, BCa);
+    vXNA(Esu, BCu, BCa, BCe);
 
-        // Next Round
+    // Next Round
 
-        //    prepareTheta
-        vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
-        vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
-        vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
-        vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
-        vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
+    //    prepareTheta
+    vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
+    vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
+    vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
+    vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
+    vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Eba, Eba, Da);
-        vxor(Ege, Ege, De);
-        vROL(BCe, Ege, 44);
-        vxor(Eki, Eki, Di);
-        vROL(BCi, Eki, 43);
-        vxor(Emo, Emo, Do);
-        vROL(BCo, Emo, 21);
-        vxor(Esu, Esu, Du);
-        vROL(BCu, Esu, 14);
-        vXNA(Aba, Eba, BCe, BCi);
-        vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
-        vXNA(Abe, BCe, BCi, BCo);
-        vXNA(Abi, BCi, BCo, BCu);
-        vXNA(Abo, BCo, BCu, Eba);
-        vXNA(Abu, BCu, Eba, BCe);
+    vxor(Eba, Eba, Da);
+    vxor(Ege, Ege, De);
+    vROL(BCe, Ege, 44);
+    vxor(Eki, Eki, Di);
+    vROL(BCi, Eki, 43);
+    vxor(Emo, Emo, Do);
+    vROL(BCo, Emo, 21);
+    vxor(Esu, Esu, Du);
+    vROL(BCu, Esu, 14);
+    vXNA(Aba, Eba, BCe, BCi);
+    vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
+    vXNA(Abe, BCe, BCi, BCo);
+    vXNA(Abi, BCi, BCo, BCu);
+    vXNA(Abo, BCo, BCu, Eba);
+    vXNA(Abu, BCu, Eba, BCe);
 
-        vxor(Ebo, Ebo, Do);
-        vROL(BCa, Ebo, 28);
-        vxor(Egu, Egu, Du);
-        vROL(BCe, Egu, 20);
-        vxor(Eka, Eka, Da);
-        vROL(BCi, Eka, 3);
-        vxor(Eme, Eme, De);
-        vROL(BCo, Eme, 45);
-        vxor(Esi, Esi, Di);
-        vROL(BCu, Esi, 61);
-        vXNA(Aga, BCa, BCe, BCi);
-        vXNA(Age, BCe, BCi, BCo);
-        vXNA(Agi, BCi, BCo, BCu);
-        vXNA(Ago, BCo, BCu, BCa);
-        vXNA(Agu, BCu, BCa, BCe);
+    vxor(Ebo, Ebo, Do);
+    vROL(BCa, Ebo, 28);
+    vxor(Egu, Egu, Du);
+    vROL(BCe, Egu, 20);
+    vxor(Eka, Eka, Da);
+    vROL(BCi, Eka, 3);
+    vxor(Eme, Eme, De);
+    vROL(BCo, Eme, 45);
+    vxor(Esi, Esi, Di);
+    vROL(BCu, Esi, 61);
+    vXNA(Aga, BCa, BCe, BCi);
+    vXNA(Age, BCe, BCi, BCo);
+    vXNA(Agi, BCi, BCo, BCu);
+    vXNA(Ago, BCo, BCu, BCa);
+    vXNA(Agu, BCu, BCa, BCe);
 
-        vxor(Ebe, Ebe, De);
-        vROL(BCa, Ebe, 1);
-        vxor(Egi, Egi, Di);
-        vROL(BCe, Egi, 6);
-        vxor(Eko, Eko, Do);
-        vROL(BCi, Eko, 25);
-        vxor(Emu, Emu, Du);
-        vROL(BCo, Emu, 8);
-        vxor(Esa, Esa, Da);
-        vROL(BCu, Esa, 18);
-        vXNA(Aka, BCa, BCe, BCi);
-        vXNA(Ake, BCe, BCi, BCo);
-        vXNA(Aki, BCi, BCo, BCu);
-        vXNA(Ako, BCo, BCu, BCa);
-        vXNA(Aku, BCu, BCa, BCe);
+    vxor(Ebe, Ebe, De);
+    vROL(BCa, Ebe, 1);
+    vxor(Egi, Egi, Di);
+    vROL(BCe, Egi, 6);
+    vxor(Eko, Eko, Do);
+    vROL(BCi, Eko, 25);
+    vxor(Emu, Emu, Du);
+    vROL(BCo, Emu, 8);
+    vxor(Esa, Esa, Da);
+    vROL(BCu, Esa, 18);
+    vXNA(Aka, BCa, BCe, BCi);
+    vXNA(Ake, BCe, BCi, BCo);
+    vXNA(Aki, BCi, BCo, BCu);
+    vXNA(Ako, BCo, BCu, BCa);
+    vXNA(Aku, BCu, BCa, BCe);
 
-        vxor(Ebu, Ebu, Du);
-        vROL(BCa, Ebu, 27);
-        vxor(Ega, Ega, Da);
-        vROL(BCe, Ega, 36);
-        vxor(Eke, Eke, De);
-        vROL(BCi, Eke, 10);
-        vxor(Emi, Emi, Di);
-        vROL(BCo, Emi, 15);
-        vxor(Eso, Eso, Do);
-        vROL(BCu, Eso, 56);
-        vXNA(Ama, BCa, BCe, BCi);
-        vXNA(Ame, BCe, BCi, BCo);
-        vXNA(Ami, BCi, BCo, BCu);
-        vXNA(Amo, BCo, BCu, BCa);
-        vXNA(Amu, BCu, BCa, BCe);
+    vxor(Ebu, Ebu, Du);
+    vROL(BCa, Ebu, 27);
+    vxor(Ega, Ega, Da);
+    vROL(BCe, Ega, 36);
+    vxor(Eke, Eke, De);
+    vROL(BCi, Eke, 10);
+    vxor(Emi, Emi, Di);
+    vROL(BCo, Emi, 15);
+    vxor(Eso, Eso, Do);
+    vROL(BCu, Eso, 56);
+    vXNA(Ama, BCa, BCe, BCi);
+    vXNA(Ame, BCe, BCi, BCo);
+    vXNA(Ami, BCi, BCo, BCu);
+    vXNA(Amo, BCo, BCu, BCa);
+    vXNA(Amu, BCu, BCa, BCe);
 
-        vxor(Ebi, Ebi, Di);
-        vROL(BCa, Ebi, 62);
-        vxor(Ego, Ego, Do);
-        vROL(BCe, Ego, 55);
-        vxor(Eku, Eku, Du);
-        vROL(BCi, Eku, 39);
-        vxor(Ema, Ema, Da);
-        vROL(BCo, Ema, 41);
-        vxor(Ese, Ese, De);
-        vROL(BCu, Ese, 2);
-        vXNA(Asa, BCa, BCe, BCi);
-        vXNA(Ase, BCe, BCi, BCo);
-        vXNA(Asi, BCi, BCo, BCu);
-        vXNA(Aso, BCo, BCu, BCa);
-        vXNA(Asu, BCu, BCa, BCe);
-    }
+    vxor(Ebi, Ebi, Di);
+    vROL(BCa, Ebi, 62);
+    vxor(Ego, Ego, Do);
+    vROL(BCe, Ego, 55);
+    vxor(Eku, Eku, Du);
+    vROL(BCi, Eku, 39);
+    vxor(Ema, Ema, Da);
+    vROL(BCo, Ema, 41);
+    vxor(Ese, Ese, De);
+    vROL(BCu, Ese, 2);
+    vXNA(Asa, BCa, BCe, BCi);
+    vXNA(Ase, BCe, BCi, BCo);
+    vXNA(Asi, BCi, BCo, BCu);
+    vXNA(Aso, BCo, BCu, BCa);
+    vXNA(Asu, BCu, BCa, BCe);
+  }
 
-    state[0] = Aba;
-    state[1] = Abe;
-    state[2] = Abi;
-    state[3] = Abo;
-    state[4] = Abu;
-    state[5] = Aga;
-    state[6] = Age;
-    state[7] = Agi;
-    state[8] = Ago;
-    state[9] = Agu;
-    state[10] = Aka;
-    state[11] = Ake;
-    state[12] = Aki;
-    state[13] = Ako;
-    state[14] = Aku;
-    state[15] = Ama;
-    state[16] = Ame;
-    state[17] = Ami;
-    state[18] = Amo;
-    state[19] = Amu;
-    state[20] = Asa;
-    state[21] = Ase;
-    state[22] = Asi;
-    state[23] = Aso;
-    state[24] = Asu;
+  state[0] = Aba;
+  state[1] = Abe;
+  state[2] = Abi;
+  state[3] = Abo;
+  state[4] = Abu;
+  state[5] = Aga;
+  state[6] = Age;
+  state[7] = Agi;
+  state[8] = Ago;
+  state[9] = Agu;
+  state[10] = Aka;
+  state[11] = Ake;
+  state[12] = Aki;
+  state[13] = Ako;
+  state[14] = Aku;
+  state[15] = Ama;
+  state[16] = Ame;
+  state[17] = Ami;
+  state[18] = Amo;
+  state[19] = Amu;
+  state[20] = Asa;
+  state[21] = Ase;
+  state[22] = Asi;
+  state[23] = Aso;
+  state[24] = Asu;
+#endif
 }
 
 /*************************************************
@@ -463,39 +504,41 @@
                             uint8_t *out1,
                             size_t nblocks,
                             unsigned int r,
-                            v128 s[25]) {
-    unsigned int i;
+                            v128 s[25]){
+  unsigned int i;
 
-    uint64x1_t a, b;
-    uint64x2x2_t a2, b2;
+  uint64x1_t a, b;
+  uint64x2x2_t a2, b2;
 
-    while (nblocks > 0) {
-        KeccakF1600_StatePermutex2(s);
+  while (nblocks > 0)
+  {
+    KeccakF1600_StatePermutex2(s);
 
-        for (i = 0; i < r / 8 - 1; i += 4) {
-            a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
-            b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
-            a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
-            b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
-            vst1q_u64_x2((uint64_t *)out0, a2);
-            vst1q_u64_x2((uint64_t *)out1, b2);
+    for (i = 0; i < r / 8 - 1; i += 4)
+    {
+      a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
+      b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
+      a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
+      b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
+      vst1q_u64_x2((uint64_t *)out0, a2);
+      vst1q_u64_x2((uint64_t *)out1, b2);
 
-            out0 += 32;
-            out1 += 32;
-        }
-
-        i = r / 8 - 1;
-        // Last iteration
-        a = vget_low_u64(s[i]);
-        b = vget_high_u64(s[i]);
-        vst1_u64((uint64_t *)out0, a);
-        vst1_u64((uint64_t *)out1, b);
-
-        out0 += 8;
-        out1 += 8;
-
-        --nblocks;
+      out0 += 32;
+      out1 += 32;
     }
+
+    i = r / 8 - 1;
+    // Last iteration
+    a = vget_low_u64(s[i]);
+    b = vget_high_u64(s[i]);
+    vst1_u64((uint64_t *)out0, a);
+    vst1_u64((uint64_t *)out1, b);
+
+    out0 += 8;
+    out1 += 8;
+
+    --nblocks;
+  }
 }
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/fips202x2.h b/src/kem/kyber/pqclean_kyber768_aarch64/fips202x2.h
index 7cffd7b..a1eacdf 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/fips202x2.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/fips202x2.h
@@ -1,10 +1,17 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef FIPS202X2_H
 #define FIPS202X2_H
 
 #include "params.h"
 #include <arm_neon.h>
 #include <stddef.h>
-
 #include "fips202.h"
 
 typedef uint64x2_t v128;
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/indcpa.c b/src/kem/kyber/pqclean_kyber768_aarch64/indcpa.c
index c273dc4..ff24f15 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/indcpa.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/indcpa.c
@@ -1,15 +1,50 @@
-#include "NTT_params.h"
-#include "indcpa.h"
-#include "ntt.h"
-#include "params.h"
-#include "poly.h"
-#include "polyvec.h"
-#include "randombytes.h"
-#include "rejsample.h"
-#include "symmetric.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+#include "params.h"
+#include "rejsample.h"
+#include "indcpa.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "symmetric.h"
+
+#include "NTT_params.h"
+#include "ntt.h"
 
 /*************************************************
 * Name:        pack_pk
@@ -125,105 +160,113 @@
 **************************************************/
 #define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
 // Not static for benchmarking
-void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed) {
-    unsigned int ctr0, ctr1, k;
-    unsigned int buflen, off;
-    uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
-            buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
-    neon_xof_state state;
+void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed)
+{
+  unsigned int ctr0, ctr1, k;
+  unsigned int buflen, off;
+  uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
+      buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
+  neon_xof_state state;
 
-    int16_t *s1 = NULL, *s2 = NULL;
-    unsigned int x1, x2, y1, y2;
-    xof_state c_state;
-    xof_init(&c_state);
+  int16_t *s1 = NULL, *s2 = NULL;
+  unsigned int x1, x2, y1, y2;
+  xof_state c_state;
+  shake128_inc_init(&c_state); // patch: call shake128_inc_init directly in place of xof_init
 
-    for (unsigned int j = 0; j < KYBER_K * KYBER_K - 1; j += 2) {
-        switch (j) {
-        case 0:
-            s1 = &(a[0][0][0]);
-            s2 = &(a[0][1][0]);
-            x1 = 0;
-            y1 = 0;
-            x2 = 0;
-            y2 = 1;
-            break;
-        case 2:
-            s1 = &(a[0][2][0]);
-            s2 = &(a[1][0][0]);
-            x1 = 0;
-            y1 = 2;
-            x2 = 1;
-            y2 = 0;
-            break;
-        case 4:
-            s1 = &(a[1][1][0]);
-            s2 = &(a[1][2][0]);
-            x1 = 1;
-            y1 = 1;
-            x2 = 1;
-            y2 = 2;
-            break;
-        default:
-            s1 = &(a[2][0][0]);
-            s2 = &(a[2][1][0]);
-            x1 = 2;
-            y1 = 0;
-            x2 = 2;
-            y2 = 1;
-            break;
-        }
-
-        if (transposed) {
-            neon_xof_absorb(&state, seed, x1, x2, y1, y2);
-        } else {
-            neon_xof_absorb(&state, seed, y1, y2, x1, x2);
-        }
-
-        neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
-
-        buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
-
-        ctr0 = neon_rej_uniform(s1, buf0);
-        ctr1 = neon_rej_uniform(s2, buf1);
-
-        while (ctr0 < KYBER_N || ctr1 < KYBER_N) {
-            off = buflen % 3;
-            for (k = 0; k < off; k++) {
-                buf0[k] = buf0[buflen - off + k];
-                buf1[k] = buf1[buflen - off + k];
-            }
-            neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
-
-            buflen = off + XOF_BLOCKBYTES;
-            ctr0 += rej_uniform(s1 + ctr0, KYBER_N - ctr0, buf0, buflen);
-            ctr1 += rej_uniform(s2 + ctr1, KYBER_N - ctr1, buf1, buflen);
-        }
+  for (unsigned int j = 0; j < KYBER_K * KYBER_K - 1; j += 2)
+  {
+    switch (j)
+    {
+    case 0:
+      s1 = &(a[0][0][0]);
+      s2 = &(a[0][1][0]);
+      x1 = 0;
+      y1 = 0;
+      x2 = 0;
+      y2 = 1;
+      break;
+    case 2:
+      s1 = &(a[0][2][0]);
+      s2 = &(a[1][0][0]);
+      x1 = 0;
+      y1 = 2;
+      x2 = 1;
+      y2 = 0;
+      break;
+    case 4:
+      s1 = &(a[1][1][0]);
+      s2 = &(a[1][2][0]);
+      x1 = 1;
+      y1 = 1;
+      x2 = 1;
+      y2 = 2;
+      break;
+    default:
+      s1 = &(a[2][0][0]);
+      s2 = &(a[2][1][0]);
+      x1 = 2;
+      y1 = 0;
+      x2 = 2;
+      y2 = 1;
+      break;
     }
 
-    // Last iteration [2][2]
-    if (transposed) {
-        xof_absorb(&c_state, seed, 2, 2);
-    } else {
-        xof_absorb(&c_state, seed, 2, 2);
-    }
+    if (transposed)
+      neon_xof_absorb(&state, seed, x1, x2, y1, y2);
+    else
+      neon_xof_absorb(&state, seed, y1, y2, x1, x2);
 
-    xof_squeezeblocks(buf0, GEN_MATRIX_NBLOCKS, &c_state);
+    neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
 
     buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
 
-    ctr0 = neon_rej_uniform(&(a[2][2][0]), buf0);
+    ctr0 = neon_rej_uniform(s1, buf0);
+    ctr1 = neon_rej_uniform(s2, buf1);
 
-    while (ctr0 < KYBER_N) {
-        off = buflen % 3;
-        for (k = 0; k < off; k++) {
-            buf0[k] = buf0[buflen - off + k];
-        }
-        xof_squeezeblocks(buf0 + off, 1, &c_state);
+    while (ctr0 < KYBER_N || ctr1 < KYBER_N)
+    {
+      off = buflen % 3;
+      for (k = 0; k < off; k++)
+      {
+        buf0[k] = buf0[buflen - off + k];
+        buf1[k] = buf1[buflen - off + k];
+      }
+      neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
 
-        buflen = off + XOF_BLOCKBYTES;
-        ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
+      buflen = off + XOF_BLOCKBYTES;
+      ctr0 += rej_uniform(s1 + ctr0, KYBER_N - ctr0, buf0, buflen);
+      ctr1 += rej_uniform(s2 + ctr1, KYBER_N - ctr1, buf1, buflen);
     }
-    shake128_inc_ctx_release(&c_state);
+  }
+
+  // Last iteration [2][2]: row and column indices are equal, so both branches absorb the same (2, 2)
+  if (transposed){
+    xof_absorb(&c_state, seed, 2, 2);
+  }
+  else{
+    xof_absorb(&c_state, seed, 2, 2);
+  }
+
+  xof_squeezeblocks(buf0, GEN_MATRIX_NBLOCKS, &c_state);
+
+  buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
+
+  ctr0 = neon_rej_uniform(&(a[2][2][0]), buf0);
+
+  while (ctr0 < KYBER_N)
+  {
+    off = buflen % 3;
+    for (k = 0; k < off; k++)
+    {
+      buf0[k] = buf0[buflen - off + k];
+    }
+    xof_squeezeblocks(buf0 + off, 1, &c_state);
+
+    buflen = off + XOF_BLOCKBYTES;
+    ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
+  }
+
+  shake128_inc_ctx_release(&c_state);
 
 }
 
@@ -263,11 +306,11 @@
     neon_polyvec_ntt(e);
 
     for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended(&(skpv_asymmetric[i][0]), &(skpv[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
+        PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended(&(skpv_asymmetric[i][0]), &(skpv[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
     }
 
     for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery(&(a[i][0][0]), &(skpv[0][0]), &(skpv_asymmetric[0][0]), asymmetric_const, pkpv[i]);
+        PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery(&(a[i][0][0]), &(skpv[0][0]), &(skpv_asymmetric[0][0]), asymmetric_const, pkpv[i]);
     }
 
     neon_polyvec_add_reduce(pkpv, e);
@@ -321,15 +364,15 @@
 
     neon_polyvec_ntt(sp);
 
-    for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended(&(sp_asymmetric[i][0]), &(sp[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
+    for(i = 0; i < KYBER_K; i++){
+        PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended(&(sp_asymmetric[i][0]), &(sp[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
     }
 
-    for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul(&(at[i][0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, b[i]);
+    for(i = 0; i < KYBER_K; i++){
+        PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul(&(at[i][0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, b[i]);
     }
 
-    PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul(&(pkpv[0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, v);
+    PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul(&(pkpv[0][0]), &(sp[0][0]), &(sp_asymmetric[0][0]), asymmetric_const, v);
 
     neon_polyvec_invntt_to_mont(b);
     invntt(v);
@@ -371,10 +414,10 @@
     neon_polyvec_ntt(b);
 
     for (i = 0; i < KYBER_K; i++) {
-        PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended(&(b_asymmetric[i][0]), &(b[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
+        PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended(&(b_asymmetric[i][0]), &(b[i][0]), pre_asymmetric_table_Q1_extended, asymmetric_const);
     }
 
-    PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul(&(skpv[0][0]), &(b[0][0]), &(b_asymmetric[0][0]), asymmetric_const, mp);
+    PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul(&(skpv[0][0]), &(b[0][0]), &(b_asymmetric[0][0]), asymmetric_const, mp);
 
     invntt(mp);
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/indcpa.h b/src/kem/kyber/pqclean_kyber768_aarch64/indcpa.h
index f718f39..b74bc0b 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/indcpa.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/indcpa.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef INDCPA_H
 #define INDCPA_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/kem.c b/src/kem/kyber/pqclean_kyber768_aarch64/kem.c
index 2880e6d..42b1220 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/kem.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/kem.c
@@ -1,11 +1,18 @@
-#include "indcpa.h"
-#include "kem.h"
-#include "params.h"
-#include "randombytes.h"
-#include "symmetric.h"
-#include "verify.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #include <stddef.h>
 #include <stdint.h>
+#include "params.h"
+#include "indcpa.h"
+#include "verify.h"
+#include "symmetric.h"
+#include "randombytes.h"
+#include "kem.h"
 
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/kem.h b/src/kem/kyber/pqclean_kyber768_aarch64/kem.h
index fc5cec9..fcfa2e8 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/kem.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/kem.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef KEM_H
 #define KEM_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/macros.inc b/src/kem/kyber/pqclean_kyber768_aarch64/macros.inc
index 9e392b0..2add309 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/macros.inc
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/macros.inc
@@ -1,4 +1,30 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef MACROS_S
 #define MACROS_S
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/macros_common.inc b/src/kem/kyber/pqclean_kyber768_aarch64/macros_common.inc
index 26e7cbb..c1ac021 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/macros_common.inc
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/macros_common.inc
@@ -1,4 +1,30 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef MACROS_COMMON
 #define MACROS_COMMON
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/neon_poly.c b/src/kem/kyber/pqclean_kyber768_aarch64/neon_poly.c
index ca51919..cd6ce6e 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/neon_poly.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/neon_poly.c
@@ -1,9 +1,43 @@
-#include "cbd.h"
-#include "ntt.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <arm_neon.h>
 #include "params.h"
 #include "poly.h"
+#include "ntt.h"
+#include "cbd.h"
 #include "symmetric.h"
-#include <arm_neon.h>
 
 
 /*************************************************
@@ -97,14 +131,14 @@
 *            - const poly *a: pointer to first input polynomial
 *            - const poly *b: pointer to second input polynomial
 **************************************************/
-extern void PQCLEAN_KYBER768_AARCH64_asm_add_reduce(int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER768_AARCH64__asm_add_reduce(int16_t *, const int16_t *);
 void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
-    PQCLEAN_KYBER768_AARCH64_asm_add_reduce(c, a);
+    PQCLEAN_KYBER768_AARCH64__asm_add_reduce(c, a);
 }
 
-extern void PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *);
 void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) {
-    PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce(c, a, b);
+    PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce(c, a, b);
 }
 
 /*************************************************
@@ -118,7 +152,7 @@
 *            - const poly *a: pointer to first input polynomial
 *            - const poly *b: pointer to second input polynomial
 **************************************************/
-extern void PQCLEAN_KYBER768_AARCH64_asm_sub_reduce(int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER768_AARCH64__asm_sub_reduce(int16_t *, const int16_t *);
 void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) {
-    PQCLEAN_KYBER768_AARCH64_asm_sub_reduce(c, a);
+    PQCLEAN_KYBER768_AARCH64__asm_sub_reduce(c, a);
 }
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/neon_polyvec.c b/src/kem/kyber/pqclean_kyber768_aarch64/neon_polyvec.c
index 1af48ea..c05f59d 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/neon_polyvec.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/neon_polyvec.c
@@ -1,10 +1,45 @@
-#include "NTT_params.h"
-#include "ntt.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <arm_neon.h>
 #include "params.h"
+#include "reduce.h"
+#include "ntt.h"
 #include "poly.h"
 #include "polyvec.h"
-#include "reduce.h"
-#include <arm_neon.h>
+
+#include "NTT_params.h"
 
 #define _V (((1U << 26) + KYBER_Q / 2) / KYBER_Q)
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/neon_symmetric-shake.c b/src/kem/kyber/pqclean_kyber768_aarch64/neon_symmetric-shake.c
index 6515250..8aced5e 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/neon_symmetric-shake.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/neon_symmetric-shake.c
@@ -1,8 +1,42 @@
-#include "fips202x2.h"
-#include "params.h"
-#include "symmetric.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <stddef.h>
 #include <stdint.h>
+#include "params.h"
+#include "fips202x2.h"
+#include "symmetric.h"
 
 /*************************************************
 * Name:        kyber_shake128_absorb
@@ -19,22 +53,23 @@
 void neon_kyber_shake128_absorb(keccakx2_state *state,
                                 const uint8_t seed[KYBER_SYMBYTES],
                                 uint8_t x1, uint8_t x2,
-                                uint8_t y1, uint8_t y2) {
-    unsigned int i;
-    uint8_t extseed1[KYBER_SYMBYTES + 2 + 14];
-    uint8_t extseed2[KYBER_SYMBYTES + 2 + 14];
+                                uint8_t y1, uint8_t y2)
+{
+  unsigned int i;
+  uint8_t extseed1[KYBER_SYMBYTES+2];
+  uint8_t extseed2[KYBER_SYMBYTES+2];
 
-    for (i = 0; i < KYBER_SYMBYTES; i++) {
-        extseed1[i] = seed[i];
-        extseed2[i] = seed[i];
-    }
-    extseed1[KYBER_SYMBYTES  ] = x1;
-    extseed1[KYBER_SYMBYTES + 1] = y1;
+  for(i=0;i<KYBER_SYMBYTES;i++){
+    extseed1[i] = seed[i];
+    extseed2[i] = seed[i];
+  }
+  extseed1[KYBER_SYMBYTES  ] = x1;
+  extseed1[KYBER_SYMBYTES+1] = y1;
 
-    extseed2[KYBER_SYMBYTES  ] = x2;
-    extseed2[KYBER_SYMBYTES + 1] = y2;
+  extseed2[KYBER_SYMBYTES  ] = x2;
+  extseed2[KYBER_SYMBYTES+1] = y2;
 
-    shake128x2_absorb(state, extseed1, extseed2, KYBER_SYMBYTES + 2);
+  shake128x2_absorb(state, extseed1, extseed2, sizeof(extseed1));
 }
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/ntt.c b/src/kem/kyber/pqclean_kyber768_aarch64/ntt.c
index 1216c2c..7f28d9a 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/ntt.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/ntt.c
@@ -1,8 +1,35 @@
-#include "NTT_params.h"
-#include "ntt.h"
-#include "params.h"
-#include "reduce.h"
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <arm_neon.h>
+#include "params.h"
+#include "ntt.h"
+#include "reduce.h"
+#include "NTT_params.h"
 
 /*************************************************
 * Name:        ntt
@@ -27,7 +54,8 @@
 * Arguments:   - int16_t r[256] in {-(q-1)/2,...,(q-1)/2}
 *              pointer to input/output vector of elements of Zq
 **************************************************/
-void invntt(int16_t r[256]) {
-    iNTT(r);
+void invntt(int16_t r[256])
+{
+  iNTT(r);
 
 }
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/ntt.h b/src/kem/kyber/pqclean_kyber768_aarch64/ntt.h
index d6990cb..67674cf 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/ntt.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/ntt.h
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_H
 #define NTT_H
 #include "params.h"
@@ -9,53 +36,55 @@
 
 #define ntt KYBER_NAMESPACE(ntt)
 void ntt(int16_t r[256]);
-
 #define invntt KYBER_NAMESPACE(invntt)
 void invntt(int16_t r[256]);
 
 
-extern void PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *);
-extern void PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top(int16_t*, const int16_t*, const int16_t*);
+extern void PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot(int16_t*, const int16_t*, const int16_t*);
 
-extern void PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot(int16_t *, const int16_t *, const int16_t *);
-extern void PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top(int16_t *, const int16_t *, const int16_t *);
+extern void PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot(int16_t*, const int16_t*, const int16_t*);
+extern void PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top(int16_t*, const int16_t*, const int16_t*);
 
-extern void PQCLEAN_KYBER768_AARCH64_asm_point_mul_extended(int16_t *, const int16_t *, const int16_t *, const int16_t *);
-extern void PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *);
-extern void PQCLEAN_KYBER768_AARCH64_asm_asymmetric_mul_montgomery(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *);
+extern void PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended(int16_t*, const int16_t*, const int16_t*, const int16_t*);
+extern void PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul(const int16_t*, const int16_t*, const int16_t*, const int16_t*, int16_t*);
+extern void PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery(const int16_t*, const int16_t*, const int16_t*, const int16_t*, int16_t*);
 
 static const int16_t asymmetric_const[16] = {
-    Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime
+Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime
 };
 
 #define NTT(in) { \
-        PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
-        PQCLEAN_KYBER768_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
     }
 
 #define iNTT(in) { \
-        PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
-        PQCLEAN_KYBER768_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
+	PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
+	PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
     }
 
 static const int16_t constants[16] = {
-    Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1,
-    invNQ1_R3modQ1_prime_half,
-    invNQ1_R3modQ1_doubleprime,
-    invNQ1_final_R3modQ1_prime_half,
-    invNQ1_final_R3modQ1_doubleprime
+Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1,
+invNQ1_R3modQ1_prime_half,
+invNQ1_R3modQ1_doubleprime,
+invNQ1_final_R3modQ1_prime_half,
+invNQ1_final_R3modQ1_doubleprime
 };
 
-static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = {
-    0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, -7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0
+static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] =
+{
+0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, -7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0
 };
 
-static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = {
-    167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175
+static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] =
+{
+167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175
 };
 
-static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = {
-    0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 10355, 1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0
+static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] =
+{
+0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 10355, 1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0
 };
 
 #endif
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/params.h b/src/kem/kyber/pqclean_kyber768_aarch64/params.h
index 76953c4..9404475 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/params.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/params.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef PARAMS_H
 #define PARAMS_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/poly.c b/src/kem/kyber/pqclean_kyber768_aarch64/poly.c
index 6d1ecdd..dffc655 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/poly.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/poly.c
@@ -1,10 +1,43 @@
-#include "cbd.h"
-#include "ntt.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/blob/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <arm_neon.h>
 #include "params.h"
 #include "poly.h"
+#include "ntt.h"
 #include "reduce.h"
+#include "cbd.h"
 #include "symmetric.h"
-#include <arm_neon.h>
 
 /*************************************************
 * Name:        poly_compress
@@ -139,6 +172,9 @@
     unsigned int i, j;
     int16_t mask;
 
+    #if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
+#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
+    #endif
 
     for (i = 0; i < KYBER_N / 8; i++) {
         for (j = 0; j < 8; j++) {
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/poly.h b/src/kem/kyber/pqclean_kyber768_aarch64/poly.h
index 51657a6..4caf07d 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/poly.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/poly.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef POLY_H
 #define POLY_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/polyvec.c b/src/kem/kyber/pqclean_kyber768_aarch64/polyvec.c
index 58dc92a..d400348 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/polyvec.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/polyvec.c
@@ -1,7 +1,14 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
+#include <stdint.h>
 #include "params.h"
 #include "poly.h"
 #include "polyvec.h"
-#include <stdint.h>
 
 /*************************************************
 * Name:        polyvec_compress
@@ -15,6 +22,31 @@
 void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]) {
     unsigned int i, j, k;
 
+    #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+    uint16_t t[8];
+    for (i = 0; i < KYBER_K; i++) {
+        for (j = 0; j < KYBER_N / 8; j++) {
+            for (k = 0; k < 8; k++) { /* compress 8 coefficients to 11 bits each */
+                t[k]  = a[i][8 * j + k];
+                t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; /* add KYBER_Q when the coefficient is negative */
+                t[k]  = (uint16_t)((((((uint64_t)t[k] << 11) + KYBER_Q / 2) * 645084) >> 31) & 0x7ff); /* division-free round(2^11 * t / q); 645084 = ceil(2^31 / 3329), so this assumes KYBER_Q == 3329. Avoids a variable-time '/' on secret-dependent data. */
+            }
+
+            r[ 0] = (t[0] >>  0); /* pack 8 x 11 bits into 11 output bytes */
+            r[ 1] = (t[0] >>  8) | (t[1] << 3);
+            r[ 2] = (t[1] >>  5) | (t[2] << 6);
+            r[ 3] = (t[2] >>  2);
+            r[ 4] = (t[2] >> 10) | (t[3] << 1);
+            r[ 5] = (t[3] >>  7) | (t[4] << 4);
+            r[ 6] = (t[4] >>  4) | (t[5] << 7);
+            r[ 7] = (t[5] >>  1);
+            r[ 8] = (t[5] >>  9) | (t[6] << 2);
+            r[ 9] = (t[6] >>  6) | (t[7] << 5);
+            r[10] = (t[7] >>  3);
+            r += 11; /* advance past the 11 bytes just written */
+        }
+    }
+    #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
     uint16_t t[4];
     for (i = 0; i < KYBER_K; i++) {
         for (j = 0; j < KYBER_N / 4; j++) {
@@ -32,6 +64,9 @@
             r += 5;
         }
     }
+    #else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+    #endif
 }
 
 /*************************************************
@@ -47,6 +82,26 @@
 void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
     unsigned int i, j, k;
 
+    #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+    uint16_t t[8];
+    for (i = 0; i < KYBER_K; i++) {
+        for (j = 0; j < KYBER_N / 8; j++) {
+            t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
+            t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
+            t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
+            t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
+            t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
+            t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
+            t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
+            t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
+            a += 11;
+
+            for (k = 0; k < 8; k++) {
+                r[i][8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11;
+            }
+        }
+    }
+    #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
     uint16_t t[4];
     for (i = 0; i < KYBER_K; i++) {
         for (j = 0; j < KYBER_N / 4; j++) {
@@ -61,6 +116,9 @@
             }
         }
     }
+    #else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+    #endif
 }
 
 /*************************************************
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/polyvec.h b/src/kem/kyber/pqclean_kyber768_aarch64/polyvec.h
index 560f267..04a2c5c 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/polyvec.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/polyvec.h
@@ -1,3 +1,37 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef POLYVEC_H
 #define POLYVEC_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/reduce.c b/src/kem/kyber/pqclean_kyber768_aarch64/reduce.c
index ec3328c..7143512 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/reduce.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/reduce.c
@@ -1,6 +1,13 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
+#include <stdint.h>
 #include "params.h"
 #include "reduce.h"
-#include <stdint.h>
 
 /*************************************************
 * Name:        montgomery_reduce
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/reduce.h b/src/kem/kyber/pqclean_kyber768_aarch64/reduce.h
index 9a35638..c443afb 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/reduce.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/reduce.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef REDUCE_H
 #define REDUCE_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/rejsample.c b/src/kem/kyber/pqclean_kyber768_aarch64/rejsample.c
index 05a1990..d694ab8 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/rejsample.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/rejsample.c
@@ -1,7 +1,15 @@
-#include "params.h"
-#include "rejsample.h"
-#include "symmetric.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #include <arm_neon.h>
+#include "params.h"
+#include "symmetric.h"
+#include "rejsample.h"
 
 // Define NEON operation
 // Load 8x16
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/rejsample.h b/src/kem/kyber/pqclean_kyber768_aarch64/rejsample.h
index 8a94a4d..540c3a0 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/rejsample.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/rejsample.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef REJSAMPLE_H
 #define REJSAMPLE_H
 
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/symmetric-shake.c b/src/kem/kyber/pqclean_kyber768_aarch64/symmetric-shake.c
index 9311d5d..e7e7e87 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/symmetric-shake.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/symmetric-shake.c
@@ -1,9 +1,16 @@
-#include "fips202.h"
-#include "params.h"
-#include "symmetric.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
+#include "params.h"
+#include "symmetric.h"
+#include "fips202.h"
 
 /*************************************************
 * Name:        kyber_shake128_absorb
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/symmetric.h b/src/kem/kyber/pqclean_kyber768_aarch64/symmetric.h
index 7da2246..12f6a5c 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/symmetric.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/symmetric.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef SYMMETRIC_H
 #define SYMMETRIC_H
 
@@ -25,7 +33,6 @@
 
 #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
 #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
-#define xof_init(STATE) shake128_inc_init(STATE)
 #define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
 #define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
 #define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/verify.c b/src/kem/kyber/pqclean_kyber768_aarch64/verify.c
index 5d53c66..ca30408 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/verify.c
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/verify.c
@@ -1,6 +1,13 @@
-#include "verify.h"
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #include <stddef.h>
 #include <stdint.h>
+#include "verify.h"
 
 /*************************************************
 * Name:        verify
diff --git a/src/kem/kyber/pqclean_kyber768_aarch64/verify.h b/src/kem/kyber/pqclean_kyber768_aarch64/verify.h
index 521f861..18ae986 100644
--- a/src/kem/kyber/pqclean_kyber768_aarch64/verify.h
+++ b/src/kem/kyber/pqclean_kyber768_aarch64/verify.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref
+ */
+
 #ifndef VERIFY_H
 #define VERIFY_H
 
diff --git a/src/sig/dilithium/CMakeLists.txt b/src/sig/dilithium/CMakeLists.txt
index b992392..2dff86f 100644
--- a/src/sig/dilithium/CMakeLists.txt
+++ b/src/sig/dilithium/CMakeLists.txt
@@ -24,7 +24,7 @@
 endif()
 
 if(OQS_ENABLE_SIG_dilithium_2_aarch64)
-    add_library(dilithium_2_aarch64 OBJECT pqclean_dilithium2_aarch64/__asm_iNTT.S pqclean_dilithium2_aarch64/__asm_NTT.S pqclean_dilithium2_aarch64/__asm_poly.S pqclean_dilithium2_aarch64/fips202x2.c pqclean_dilithium2_aarch64/ntt.c pqclean_dilithium2_aarch64/packing.c pqclean_dilithium2_aarch64/poly.c pqclean_dilithium2_aarch64/polyvec.c pqclean_dilithium2_aarch64/reduce.c pqclean_dilithium2_aarch64/rounding.c pqclean_dilithium2_aarch64/sign.c pqclean_dilithium2_aarch64/symmetric-shake.c)
+    add_library(dilithium_2_aarch64 OBJECT pqclean_dilithium2_aarch64/__asm_iNTT.S pqclean_dilithium2_aarch64/__asm_NTT.S pqclean_dilithium2_aarch64/__asm_poly.S pqclean_dilithium2_aarch64/feat.S pqclean_dilithium2_aarch64/fips202x2.c pqclean_dilithium2_aarch64/ntt.c pqclean_dilithium2_aarch64/packing.c pqclean_dilithium2_aarch64/poly.c pqclean_dilithium2_aarch64/polyvec.c pqclean_dilithium2_aarch64/reduce.c pqclean_dilithium2_aarch64/rounding.c pqclean_dilithium2_aarch64/sign.c pqclean_dilithium2_aarch64/symmetric-shake.c)
     target_include_directories(dilithium_2_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_dilithium2_aarch64)
     target_include_directories(dilithium_2_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(dilithium_2_aarch64 PRIVATE)
@@ -50,7 +50,7 @@
 endif()
 
 if(OQS_ENABLE_SIG_dilithium_3_aarch64)
-    add_library(dilithium_3_aarch64 OBJECT pqclean_dilithium3_aarch64/__asm_iNTT.S pqclean_dilithium3_aarch64/__asm_NTT.S pqclean_dilithium3_aarch64/__asm_poly.S pqclean_dilithium3_aarch64/fips202x2.c pqclean_dilithium3_aarch64/ntt.c pqclean_dilithium3_aarch64/packing.c pqclean_dilithium3_aarch64/poly.c pqclean_dilithium3_aarch64/polyvec.c pqclean_dilithium3_aarch64/reduce.c pqclean_dilithium3_aarch64/rounding.c pqclean_dilithium3_aarch64/sign.c pqclean_dilithium3_aarch64/symmetric-shake.c)
+    add_library(dilithium_3_aarch64 OBJECT pqclean_dilithium3_aarch64/__asm_iNTT.S pqclean_dilithium3_aarch64/__asm_NTT.S pqclean_dilithium3_aarch64/__asm_poly.S pqclean_dilithium3_aarch64/feat.S pqclean_dilithium3_aarch64/fips202x2.c pqclean_dilithium3_aarch64/ntt.c pqclean_dilithium3_aarch64/packing.c pqclean_dilithium3_aarch64/poly.c pqclean_dilithium3_aarch64/polyvec.c pqclean_dilithium3_aarch64/reduce.c pqclean_dilithium3_aarch64/rounding.c pqclean_dilithium3_aarch64/sign.c pqclean_dilithium3_aarch64/symmetric-shake.c)
     target_include_directories(dilithium_3_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_dilithium3_aarch64)
     target_include_directories(dilithium_3_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(dilithium_3_aarch64 PRIVATE)
@@ -76,7 +76,7 @@
 endif()
 
 if(OQS_ENABLE_SIG_dilithium_5_aarch64)
-    add_library(dilithium_5_aarch64 OBJECT pqclean_dilithium5_aarch64/__asm_iNTT.S pqclean_dilithium5_aarch64/__asm_NTT.S pqclean_dilithium5_aarch64/__asm_poly.S pqclean_dilithium5_aarch64/fips202x2.c pqclean_dilithium5_aarch64/ntt.c pqclean_dilithium5_aarch64/packing.c pqclean_dilithium5_aarch64/poly.c pqclean_dilithium5_aarch64/polyvec.c pqclean_dilithium5_aarch64/reduce.c pqclean_dilithium5_aarch64/rounding.c pqclean_dilithium5_aarch64/sign.c pqclean_dilithium5_aarch64/symmetric-shake.c)
+    add_library(dilithium_5_aarch64 OBJECT pqclean_dilithium5_aarch64/__asm_iNTT.S pqclean_dilithium5_aarch64/__asm_NTT.S pqclean_dilithium5_aarch64/__asm_poly.S pqclean_dilithium5_aarch64/feat.S pqclean_dilithium5_aarch64/fips202x2.c pqclean_dilithium5_aarch64/ntt.c pqclean_dilithium5_aarch64/packing.c pqclean_dilithium5_aarch64/poly.c pqclean_dilithium5_aarch64/polyvec.c pqclean_dilithium5_aarch64/reduce.c pqclean_dilithium5_aarch64/rounding.c pqclean_dilithium5_aarch64/sign.c pqclean_dilithium5_aarch64/symmetric-shake.c)
     target_include_directories(dilithium_5_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_dilithium5_aarch64)
     target_include_directories(dilithium_5_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(dilithium_5_aarch64 PRIVATE)
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/Makefile b/src/sig/dilithium/pqclean_dilithium2_aarch64/Makefile
deleted file mode 100644
index 4cdd8aa..0000000
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-# This Makefile can be used with GNU Make or BSD Make
-
-LIB=libdilithium2_aarch64.a
-HEADERS=api.h fips202x2.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h
-OBJECTS=fips202x2.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o
-
-
-CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g
-
-all: $(LIB)
-
-%.o: %.c $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-%.o: %.S $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-$(LIB): $(OBJECTS) $(HEADERS)
-	$(AR) -r $@ $(OBJECTS)
-
-clean:
-	$(RM) $(OBJECTS)
-	$(RM) $(LIB)
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/NTT_params.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/NTT_params.h
index 097952b..661952e 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/NTT_params.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/NTT_params.h
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_PARAMS_H
 #define NTT_PARAMS_H
 
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_NTT.S b/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_NTT.S
index 5809477..946c3c3 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_NTT.S
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_NTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top
+PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top:
 
     push_all
     Q         .req w20
@@ -183,13 +206,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot
+PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot:
 
     push_all
     Q         .req w20
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_iNTT.S b/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_iNTT.S
index 2a5d0fa..56a5b7a 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_iNTT.S
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_iNTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top
+PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top:
 
     push_all
     Q         .req w20
@@ -412,13 +435,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot
+PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot:
 
     push_all
     Q         .req w20
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_poly.S b/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_poly.S
index 2a40d71..e7fe838 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_poly.S
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/__asm_poly.S
@@ -1,15 +1,38 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 #include "params.h"
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32
+PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32:
 
     mov x7, #16
     _10_to_32_loop:
@@ -79,13 +102,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce
+PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce:
 
     ldr w4, [x1]
 
@@ -175,13 +195,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq
+PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq:
 
     ldr w4, [x1]
 
@@ -271,13 +288,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze
+PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze:
 
     ldr w4, [x1]
 
@@ -403,13 +417,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round
+PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round:
 
     mov w4, #1
 
@@ -552,13 +563,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add
+PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add:
 
     ld1 {v0.4S}, [x1], #16
     ld1 {v4.4S}, [x2], #16
@@ -604,13 +612,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub
+PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub:
 
     ld1 {v0.4S}, [x1], #16
     ld1 {v4.4S}, [x2], #16
@@ -656,13 +661,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl
+PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl:
 
     add x1, x0, #0
 
@@ -726,13 +728,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery
+PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery:
 
     push_all
 
@@ -848,13 +847,10 @@
 
 
 .align 2
-.global PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery
-.global _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery, %function
-#endif
-PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery:
-_PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery:
+.global PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+.global _PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
+_PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
 
     push_all
 
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/api.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/api.h
index b4ff11b..2ce4259 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/api.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/api.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef PQCLEAN_DILITHIUM2_AARCH64_API_H
 #define PQCLEAN_DILITHIUM2_AARCH64_API_H
 
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/feat.S b/src/sig/dilithium/pqclean_dilithium2_aarch64/feat.S
new file mode 100644
index 0000000..63be5df
--- /dev/null
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/feat.S
@@ -0,0 +1,168 @@
+
+/*
+MIT License
+
+Copyright (c) 2020 Bas Westerbaan
+Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3)
+
+.macro round
+    // Execute theta, but without xoring into the state yet.
+    // Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i] into v25..v29.
+    eor3 v25.16b, v0.16b, v5.16b, v10.16b
+    eor3 v26.16b, v1.16b, v6.16b, v11.16b
+    eor3 v27.16b, v2.16b, v7.16b, v12.16b
+    eor3 v28.16b, v3.16b, v8.16b, v13.16b
+    eor3 v29.16b, v4.16b, v9.16b, v14.16b
+
+    eor3 v25.16b, v25.16b, v15.16b, v20.16b
+    eor3 v26.16b, v26.16b, v16.16b, v21.16b
+    eor3 v27.16b, v27.16b, v17.16b, v22.16b
+    eor3 v28.16b, v28.16b, v18.16b, v23.16b
+    eor3 v29.16b, v29.16b, v19.16b, v24.16b
+
+    rax1 v30.2d, v29.2d, v26.2d // d[0] = rotl(p[1], 1) ^ p[4]
+    rax1 v29.2d, v27.2d, v29.2d // d[3] = rotl(p[4], 1) ^ p[2]
+    rax1 v27.2d, v25.2d, v27.2d // d[1] = rotl(p[2], 1) ^ p[0]
+    rax1 v25.2d, v28.2d, v25.2d // d[4] = rotl(p[0], 1) ^ p[3]
+    rax1 v28.2d, v26.2d, v28.2d // d[2] = rotl(p[3], 1) ^ p[1]
+
+    // Xor parities from step theta into the state at the same time
+    // as executing rho and pi (v31 keeps the old v1, consumed last).
+    eor v0.16b, v0.16b, v30.16b
+    mov v31.16b, v1.16b
+    xar v1.2d,  v6.2d,  v27.2d, #20
+    xar v6.2d,  v9.2d,  v25.2d, #44
+    xar v9.2d,  v22.2d, v28.2d, #3
+    xar v22.2d, v14.2d, v25.2d, #25
+    xar v14.2d, v20.2d, v30.2d, #46
+    xar v20.2d, v2.2d,  v28.2d, #2
+    xar v2.2d,  v12.2d, v28.2d, #21
+    xar v12.2d, v13.2d, v29.2d, #39
+    xar v13.2d, v19.2d, v25.2d, #56
+    xar v19.2d, v23.2d, v29.2d, #8
+    xar v23.2d, v15.2d, v30.2d, #23
+    xar v15.2d, v4.2d,  v25.2d, #37
+    xar v4.2d,  v24.2d, v25.2d, #50
+    xar v24.2d, v21.2d, v27.2d, #62
+    xar v21.2d, v8.2d,  v29.2d, #9
+    xar v8.2d,  v16.2d, v27.2d, #19
+    xar v16.2d, v5.2d,  v30.2d, #28
+    xar v5.2d,  v3.2d,  v29.2d, #36
+    xar v3.2d,  v18.2d, v29.2d, #43
+    xar v18.2d, v17.2d, v28.2d, #49
+    xar v17.2d, v11.2d, v27.2d, #54
+    xar v11.2d, v7.2d,  v28.2d, #58
+    xar v7.2d,  v10.2d, v30.2d, #61
+    xar v10.2d, v31.2d, v27.2d, #63
+
+    // Chi, one row of five lanes at a time (v25/v26 buffer the first two outputs)
+    bcax v25.16b, v0.16b,  v2.16b,  v1.16b
+    bcax v26.16b, v1.16b,  v3.16b,  v2.16b
+    bcax v2.16b,  v2.16b,  v4.16b,  v3.16b
+    bcax v3.16b,  v3.16b,  v0.16b,  v4.16b
+    bcax v4.16b,  v4.16b,  v1.16b,  v0.16b
+    mov v0.16b, v25.16b
+    mov v1.16b, v26.16b
+
+    bcax v25.16b, v5.16b,  v7.16b,  v6.16b
+    bcax v26.16b, v6.16b,  v8.16b,  v7.16b
+    bcax v7.16b,  v7.16b,  v9.16b,  v8.16b
+    bcax v8.16b,  v8.16b,  v5.16b,  v9.16b
+    bcax v9.16b,  v9.16b,  v6.16b,  v5.16b
+    mov v5.16b, v25.16b
+    mov v6.16b, v26.16b
+
+    bcax v25.16b, v10.16b, v12.16b, v11.16b
+    bcax v26.16b, v11.16b, v13.16b, v12.16b
+    bcax v12.16b, v12.16b, v14.16b, v13.16b
+    bcax v13.16b, v13.16b, v10.16b, v14.16b
+    bcax v14.16b, v14.16b, v11.16b, v10.16b
+    mov v10.16b, v25.16b
+    mov v11.16b, v26.16b
+
+    bcax v25.16b, v15.16b, v17.16b, v16.16b
+    bcax v26.16b, v16.16b, v18.16b, v17.16b
+    bcax v17.16b, v17.16b, v19.16b, v18.16b
+    bcax v18.16b, v18.16b, v15.16b, v19.16b
+    bcax v19.16b, v19.16b, v16.16b, v15.16b
+    mov v15.16b, v25.16b
+    mov v16.16b, v26.16b
+
+    bcax v25.16b, v20.16b, v22.16b, v21.16b
+    bcax v26.16b, v21.16b, v23.16b, v22.16b
+    bcax v22.16b, v22.16b, v24.16b, v23.16b
+    bcax v23.16b, v23.16b, v20.16b, v24.16b
+    bcax v24.16b, v24.16b, v21.16b, v20.16b
+    mov v20.16b, v25.16b
+    mov v21.16b, v26.16b
+
+    // iota: broadcast the next 64-bit round constant from [x1] to both lanes, advance x1 by 8
+    ld1r {v25.2d}, [x1], #8
+    eor v0.16b, v0.16b, v25.16b
+.endm
+// f1600x2(x0 = 25 x 128-bit state words, x1 = 64-bit round constants, one read per round): 24 rounds in place
+.align 4
+.global PQCLEAN_DILITHIUM2_AARCH64_f1600x2
+.global _PQCLEAN_DILITHIUM2_AARCH64_f1600x2
+PQCLEAN_DILITHIUM2_AARCH64_f1600x2:
+_PQCLEAN_DILITHIUM2_AARCH64_f1600x2:
+    stp d8,  d9,  [sp,#-16]!   // save d8-d15: v8-v15 are overwritten below
+    stp d10, d11, [sp,#-16]!
+    stp d12, d13, [sp,#-16]!
+    stp d14, d15, [sp,#-16]!
+
+    mov x2, x0                 // keep the state base; x0 is post-incremented by the loads
+    mov x3, #24                // round counter
+
+    ld1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    ld1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    ld1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    ld1 {v24.2d}, [x0]
+
+loop:
+    round
+
+    subs x3, x3, #1
+    cbnz x3, loop
+
+    mov x0, x2                 // rewind to the state base for the write-back
+    st1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    st1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    st1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    st1 {v24.2d}, [x0]
+
+    ldp d14, d15, [sp], #16    // restore in reverse order of the saves
+    ldp d12, d13, [sp], #16
+    ldp d10, d11, [sp], #16
+    ldp d8,  d9,  [sp], #16
+
+    ret lr
+
+#endif
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/fips202x2.c b/src/sig/dilithium/pqclean_dilithium2_aarch64/fips202x2.c
index 3924900..f2faa49 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/fips202x2.c
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/fips202x2.c
@@ -1,6 +1,40 @@
-#include "fips202x2.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <arm_neon.h>
 #include <stddef.h>
+#include "fips202x2.h"
 
 
 #define NROUNDS 24
@@ -67,275 +101,282 @@
 *
 * Arguments:   - uint64_t *state: pointer to input/output Keccak state
 **************************************************/
+extern void PQCLEAN_DILITHIUM2_AARCH64_f1600x2(v128*, const uint64_t*);
 static inline
-void KeccakF1600_StatePermutex2(v128 state[25]) {
-    v128 Aba, Abe, Abi, Abo, Abu;
-    v128 Aga, Age, Agi, Ago, Agu;
-    v128 Aka, Ake, Aki, Ako, Aku;
-    v128 Ama, Ame, Ami, Amo, Amu;
-    v128 Asa, Ase, Asi, Aso, Asu;
-    v128 BCa, BCe, BCi, BCo, BCu; // tmp
-    v128 Da, De, Di, Do, Du;      // D
-    v128 Eba, Ebe, Ebi, Ebo, Ebu;
-    v128 Ega, Ege, Egi, Ego, Egu;
-    v128 Eka, Eke, Eki, Eko, Eku;
-    v128 Ema, Eme, Emi, Emo, Emu;
-    v128 Esa, Ese, Esi, Eso, Esu;
+void KeccakF1600_StatePermutex2(v128 state[25])
+{
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* SHA-3 instructions are available: use the assembly routine from feat.S, which is built under this same condition */
+  PQCLEAN_DILITHIUM2_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
+#else
+  v128 Aba, Abe, Abi, Abo, Abu;
+  v128 Aga, Age, Agi, Ago, Agu;
+  v128 Aka, Ake, Aki, Ako, Aku;
+  v128 Ama, Ame, Ami, Amo, Amu;
+  v128 Asa, Ase, Asi, Aso, Asu;
+  v128 BCa, BCe, BCi, BCo, BCu; // tmp
+  v128 Da, De, Di, Do, Du;      // D
+  v128 Eba, Ebe, Ebi, Ebo, Ebu;
+  v128 Ega, Ege, Egi, Ego, Egu;
+  v128 Eka, Eke, Eki, Eko, Eku;
+  v128 Ema, Eme, Emi, Emo, Emu;
+  v128 Esa, Ese, Esi, Eso, Esu;
 
-    //copyFromState(A, state)
-    Aba = state[0];
-    Abe = state[1];
-    Abi = state[2];
-    Abo = state[3];
-    Abu = state[4];
-    Aga = state[5];
-    Age = state[6];
-    Agi = state[7];
-    Ago = state[8];
-    Agu = state[9];
-    Aka = state[10];
-    Ake = state[11];
-    Aki = state[12];
-    Ako = state[13];
-    Aku = state[14];
-    Ama = state[15];
-    Ame = state[16];
-    Ami = state[17];
-    Amo = state[18];
-    Amu = state[19];
-    Asa = state[20];
-    Ase = state[21];
-    Asi = state[22];
-    Aso = state[23];
-    Asu = state[24];
+  //copyFromState(A, state)
+  Aba = state[0];
+  Abe = state[1];
+  Abi = state[2];
+  Abo = state[3];
+  Abu = state[4];
+  Aga = state[5];
+  Age = state[6];
+  Agi = state[7];
+  Ago = state[8];
+  Agu = state[9];
+  Aka = state[10];
+  Ake = state[11];
+  Aki = state[12];
+  Ako = state[13];
+  Aku = state[14];
+  Ama = state[15];
+  Ame = state[16];
+  Ami = state[17];
+  Amo = state[18];
+  Amu = state[19];
+  Asa = state[20];
+  Ase = state[21];
+  Asi = state[22];
+  Aso = state[23];
+  Asu = state[24];
 
-    for (int round = 0; round < NROUNDS; round += 2) {
-        //    prepareTheta
-        vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
-        vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
-        vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
-        vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
-        vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
+  for (int round = 0; round < NROUNDS; round += 2)
+  {
+    //    prepareTheta
+    vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
+    vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
+    vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
+    vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
+    vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Aba, Aba, Da);
-        vxor(Age, Age, De);
-        vROL(BCe, Age, 44);
-        vxor(Aki, Aki, Di);
-        vROL(BCi, Aki, 43);
-        vxor(Amo, Amo, Do);
-        vROL(BCo, Amo, 21);
-        vxor(Asu, Asu, Du);
-        vROL(BCu, Asu, 14);
-        vXNA(Eba, Aba, BCe, BCi);
-        vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
-        vXNA(Ebe, BCe, BCi, BCo);
-        vXNA(Ebi, BCi, BCo, BCu);
-        vXNA(Ebo, BCo, BCu, Aba);
-        vXNA(Ebu, BCu, Aba, BCe);
+    vxor(Aba, Aba, Da);
+    vxor(Age, Age, De);
+    vROL(BCe, Age, 44);
+    vxor(Aki, Aki, Di);
+    vROL(BCi, Aki, 43);
+    vxor(Amo, Amo, Do);
+    vROL(BCo, Amo, 21);
+    vxor(Asu, Asu, Du);
+    vROL(BCu, Asu, 14);
+    vXNA(Eba, Aba, BCe, BCi);
+    vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
+    vXNA(Ebe, BCe, BCi, BCo);
+    vXNA(Ebi, BCi, BCo, BCu);
+    vXNA(Ebo, BCo, BCu, Aba);
+    vXNA(Ebu, BCu, Aba, BCe);
 
-        vxor(Abo, Abo, Do);
-        vROL(BCa, Abo, 28);
-        vxor(Agu, Agu, Du);
-        vROL(BCe, Agu, 20);
-        vxor(Aka, Aka, Da);
-        vROL(BCi, Aka, 3);
-        vxor(Ame, Ame, De);
-        vROL(BCo, Ame, 45);
-        vxor(Asi, Asi, Di);
-        vROL(BCu, Asi, 61);
-        vXNA(Ega, BCa, BCe, BCi);
-        vXNA(Ege, BCe, BCi, BCo);
-        vXNA(Egi, BCi, BCo, BCu);
-        vXNA(Ego, BCo, BCu, BCa);
-        vXNA(Egu, BCu, BCa, BCe);
+    vxor(Abo, Abo, Do);
+    vROL(BCa, Abo, 28);
+    vxor(Agu, Agu, Du);
+    vROL(BCe, Agu, 20);
+    vxor(Aka, Aka, Da);
+    vROL(BCi, Aka, 3);
+    vxor(Ame, Ame, De);
+    vROL(BCo, Ame, 45);
+    vxor(Asi, Asi, Di);
+    vROL(BCu, Asi, 61);
+    vXNA(Ega, BCa, BCe, BCi);
+    vXNA(Ege, BCe, BCi, BCo);
+    vXNA(Egi, BCi, BCo, BCu);
+    vXNA(Ego, BCo, BCu, BCa);
+    vXNA(Egu, BCu, BCa, BCe);
 
-        vxor(Abe, Abe, De);
-        vROL(BCa, Abe, 1);
-        vxor(Agi, Agi, Di);
-        vROL(BCe, Agi, 6);
-        vxor(Ako, Ako, Do);
-        vROL(BCi, Ako, 25);
-        vxor(Amu, Amu, Du);
-        vROL(BCo, Amu, 8);
-        vxor(Asa, Asa, Da);
-        vROL(BCu, Asa, 18);
-        vXNA(Eka, BCa, BCe, BCi);
-        vXNA(Eke, BCe, BCi, BCo);
-        vXNA(Eki, BCi, BCo, BCu);
-        vXNA(Eko, BCo, BCu, BCa);
-        vXNA(Eku, BCu, BCa, BCe);
+    vxor(Abe, Abe, De);
+    vROL(BCa, Abe, 1);
+    vxor(Agi, Agi, Di);
+    vROL(BCe, Agi, 6);
+    vxor(Ako, Ako, Do);
+    vROL(BCi, Ako, 25);
+    vxor(Amu, Amu, Du);
+    vROL(BCo, Amu, 8);
+    vxor(Asa, Asa, Da);
+    vROL(BCu, Asa, 18);
+    vXNA(Eka, BCa, BCe, BCi);
+    vXNA(Eke, BCe, BCi, BCo);
+    vXNA(Eki, BCi, BCo, BCu);
+    vXNA(Eko, BCo, BCu, BCa);
+    vXNA(Eku, BCu, BCa, BCe);
 
-        vxor(Abu, Abu, Du);
-        vROL(BCa, Abu, 27);
-        vxor(Aga, Aga, Da);
-        vROL(BCe, Aga, 36);
-        vxor(Ake, Ake, De);
-        vROL(BCi, Ake, 10);
-        vxor(Ami, Ami, Di);
-        vROL(BCo, Ami, 15);
-        vxor(Aso, Aso, Do);
-        vROL(BCu, Aso, 56);
-        vXNA(Ema, BCa, BCe, BCi);
-        vXNA(Eme, BCe, BCi, BCo);
-        vXNA(Emi, BCi, BCo, BCu);
-        vXNA(Emo, BCo, BCu, BCa);
-        vXNA(Emu, BCu, BCa, BCe);
+    vxor(Abu, Abu, Du);
+    vROL(BCa, Abu, 27);
+    vxor(Aga, Aga, Da);
+    vROL(BCe, Aga, 36);
+    vxor(Ake, Ake, De);
+    vROL(BCi, Ake, 10);
+    vxor(Ami, Ami, Di);
+    vROL(BCo, Ami, 15);
+    vxor(Aso, Aso, Do);
+    vROL(BCu, Aso, 56);
+    vXNA(Ema, BCa, BCe, BCi);
+    vXNA(Eme, BCe, BCi, BCo);
+    vXNA(Emi, BCi, BCo, BCu);
+    vXNA(Emo, BCo, BCu, BCa);
+    vXNA(Emu, BCu, BCa, BCe);
 
-        vxor(Abi, Abi, Di);
-        vROL(BCa, Abi, 62);
-        vxor(Ago, Ago, Do);
-        vROL(BCe, Ago, 55);
-        vxor(Aku, Aku, Du);
-        vROL(BCi, Aku, 39);
-        vxor(Ama, Ama, Da);
-        vROL(BCo, Ama, 41);
-        vxor(Ase, Ase, De);
-        vROL(BCu, Ase, 2);
-        vXNA(Esa, BCa, BCe, BCi);
-        vXNA(Ese, BCe, BCi, BCo);
-        vXNA(Esi, BCi, BCo, BCu);
-        vXNA(Eso, BCo, BCu, BCa);
-        vXNA(Esu, BCu, BCa, BCe);
+    vxor(Abi, Abi, Di);
+    vROL(BCa, Abi, 62);
+    vxor(Ago, Ago, Do);
+    vROL(BCe, Ago, 55);
+    vxor(Aku, Aku, Du);
+    vROL(BCi, Aku, 39);
+    vxor(Ama, Ama, Da);
+    vROL(BCo, Ama, 41);
+    vxor(Ase, Ase, De);
+    vROL(BCu, Ase, 2);
+    vXNA(Esa, BCa, BCe, BCi);
+    vXNA(Ese, BCe, BCi, BCo);
+    vXNA(Esi, BCi, BCo, BCu);
+    vXNA(Eso, BCo, BCu, BCa);
+    vXNA(Esu, BCu, BCa, BCe);
 
-        // Next Round
+    // Next Round
 
-        //    prepareTheta
-        vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
-        vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
-        vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
-        vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
-        vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
+    //    prepareTheta
+    vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
+    vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
+    vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
+    vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
+    vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Eba, Eba, Da);
-        vxor(Ege, Ege, De);
-        vROL(BCe, Ege, 44);
-        vxor(Eki, Eki, Di);
-        vROL(BCi, Eki, 43);
-        vxor(Emo, Emo, Do);
-        vROL(BCo, Emo, 21);
-        vxor(Esu, Esu, Du);
-        vROL(BCu, Esu, 14);
-        vXNA(Aba, Eba, BCe, BCi);
-        vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
-        vXNA(Abe, BCe, BCi, BCo);
-        vXNA(Abi, BCi, BCo, BCu);
-        vXNA(Abo, BCo, BCu, Eba);
-        vXNA(Abu, BCu, Eba, BCe);
+    vxor(Eba, Eba, Da);
+    vxor(Ege, Ege, De);
+    vROL(BCe, Ege, 44);
+    vxor(Eki, Eki, Di);
+    vROL(BCi, Eki, 43);
+    vxor(Emo, Emo, Do);
+    vROL(BCo, Emo, 21);
+    vxor(Esu, Esu, Du);
+    vROL(BCu, Esu, 14);
+    vXNA(Aba, Eba, BCe, BCi);
+    vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
+    vXNA(Abe, BCe, BCi, BCo);
+    vXNA(Abi, BCi, BCo, BCu);
+    vXNA(Abo, BCo, BCu, Eba);
+    vXNA(Abu, BCu, Eba, BCe);
 
-        vxor(Ebo, Ebo, Do);
-        vROL(BCa, Ebo, 28);
-        vxor(Egu, Egu, Du);
-        vROL(BCe, Egu, 20);
-        vxor(Eka, Eka, Da);
-        vROL(BCi, Eka, 3);
-        vxor(Eme, Eme, De);
-        vROL(BCo, Eme, 45);
-        vxor(Esi, Esi, Di);
-        vROL(BCu, Esi, 61);
-        vXNA(Aga, BCa, BCe, BCi);
-        vXNA(Age, BCe, BCi, BCo);
-        vXNA(Agi, BCi, BCo, BCu);
-        vXNA(Ago, BCo, BCu, BCa);
-        vXNA(Agu, BCu, BCa, BCe);
+    vxor(Ebo, Ebo, Do);
+    vROL(BCa, Ebo, 28);
+    vxor(Egu, Egu, Du);
+    vROL(BCe, Egu, 20);
+    vxor(Eka, Eka, Da);
+    vROL(BCi, Eka, 3);
+    vxor(Eme, Eme, De);
+    vROL(BCo, Eme, 45);
+    vxor(Esi, Esi, Di);
+    vROL(BCu, Esi, 61);
+    vXNA(Aga, BCa, BCe, BCi);
+    vXNA(Age, BCe, BCi, BCo);
+    vXNA(Agi, BCi, BCo, BCu);
+    vXNA(Ago, BCo, BCu, BCa);
+    vXNA(Agu, BCu, BCa, BCe);
 
-        vxor(Ebe, Ebe, De);
-        vROL(BCa, Ebe, 1);
-        vxor(Egi, Egi, Di);
-        vROL(BCe, Egi, 6);
-        vxor(Eko, Eko, Do);
-        vROL(BCi, Eko, 25);
-        vxor(Emu, Emu, Du);
-        vROL(BCo, Emu, 8);
-        vxor(Esa, Esa, Da);
-        vROL(BCu, Esa, 18);
-        vXNA(Aka, BCa, BCe, BCi);
-        vXNA(Ake, BCe, BCi, BCo);
-        vXNA(Aki, BCi, BCo, BCu);
-        vXNA(Ako, BCo, BCu, BCa);
-        vXNA(Aku, BCu, BCa, BCe);
+    vxor(Ebe, Ebe, De);
+    vROL(BCa, Ebe, 1);
+    vxor(Egi, Egi, Di);
+    vROL(BCe, Egi, 6);
+    vxor(Eko, Eko, Do);
+    vROL(BCi, Eko, 25);
+    vxor(Emu, Emu, Du);
+    vROL(BCo, Emu, 8);
+    vxor(Esa, Esa, Da);
+    vROL(BCu, Esa, 18);
+    vXNA(Aka, BCa, BCe, BCi);
+    vXNA(Ake, BCe, BCi, BCo);
+    vXNA(Aki, BCi, BCo, BCu);
+    vXNA(Ako, BCo, BCu, BCa);
+    vXNA(Aku, BCu, BCa, BCe);
 
-        vxor(Ebu, Ebu, Du);
-        vROL(BCa, Ebu, 27);
-        vxor(Ega, Ega, Da);
-        vROL(BCe, Ega, 36);
-        vxor(Eke, Eke, De);
-        vROL(BCi, Eke, 10);
-        vxor(Emi, Emi, Di);
-        vROL(BCo, Emi, 15);
-        vxor(Eso, Eso, Do);
-        vROL(BCu, Eso, 56);
-        vXNA(Ama, BCa, BCe, BCi);
-        vXNA(Ame, BCe, BCi, BCo);
-        vXNA(Ami, BCi, BCo, BCu);
-        vXNA(Amo, BCo, BCu, BCa);
-        vXNA(Amu, BCu, BCa, BCe);
+    vxor(Ebu, Ebu, Du);
+    vROL(BCa, Ebu, 27);
+    vxor(Ega, Ega, Da);
+    vROL(BCe, Ega, 36);
+    vxor(Eke, Eke, De);
+    vROL(BCi, Eke, 10);
+    vxor(Emi, Emi, Di);
+    vROL(BCo, Emi, 15);
+    vxor(Eso, Eso, Do);
+    vROL(BCu, Eso, 56);
+    vXNA(Ama, BCa, BCe, BCi);
+    vXNA(Ame, BCe, BCi, BCo);
+    vXNA(Ami, BCi, BCo, BCu);
+    vXNA(Amo, BCo, BCu, BCa);
+    vXNA(Amu, BCu, BCa, BCe);
 
-        vxor(Ebi, Ebi, Di);
-        vROL(BCa, Ebi, 62);
-        vxor(Ego, Ego, Do);
-        vROL(BCe, Ego, 55);
-        vxor(Eku, Eku, Du);
-        vROL(BCi, Eku, 39);
-        vxor(Ema, Ema, Da);
-        vROL(BCo, Ema, 41);
-        vxor(Ese, Ese, De);
-        vROL(BCu, Ese, 2);
-        vXNA(Asa, BCa, BCe, BCi);
-        vXNA(Ase, BCe, BCi, BCo);
-        vXNA(Asi, BCi, BCo, BCu);
-        vXNA(Aso, BCo, BCu, BCa);
-        vXNA(Asu, BCu, BCa, BCe);
-    }
+    vxor(Ebi, Ebi, Di);
+    vROL(BCa, Ebi, 62);
+    vxor(Ego, Ego, Do);
+    vROL(BCe, Ego, 55);
+    vxor(Eku, Eku, Du);
+    vROL(BCi, Eku, 39);
+    vxor(Ema, Ema, Da);
+    vROL(BCo, Ema, 41);
+    vxor(Ese, Ese, De);
+    vROL(BCu, Ese, 2);
+    vXNA(Asa, BCa, BCe, BCi);
+    vXNA(Ase, BCe, BCi, BCo);
+    vXNA(Asi, BCi, BCo, BCu);
+    vXNA(Aso, BCo, BCu, BCa);
+    vXNA(Asu, BCu, BCa, BCe);
+  }
 
-    state[0] = Aba;
-    state[1] = Abe;
-    state[2] = Abi;
-    state[3] = Abo;
-    state[4] = Abu;
-    state[5] = Aga;
-    state[6] = Age;
-    state[7] = Agi;
-    state[8] = Ago;
-    state[9] = Agu;
-    state[10] = Aka;
-    state[11] = Ake;
-    state[12] = Aki;
-    state[13] = Ako;
-    state[14] = Aku;
-    state[15] = Ama;
-    state[16] = Ame;
-    state[17] = Ami;
-    state[18] = Amo;
-    state[19] = Amu;
-    state[20] = Asa;
-    state[21] = Ase;
-    state[22] = Asi;
-    state[23] = Aso;
-    state[24] = Asu;
+  state[0] = Aba;
+  state[1] = Abe;
+  state[2] = Abi;
+  state[3] = Abo;
+  state[4] = Abu;
+  state[5] = Aga;
+  state[6] = Age;
+  state[7] = Agi;
+  state[8] = Ago;
+  state[9] = Agu;
+  state[10] = Aka;
+  state[11] = Ake;
+  state[12] = Aki;
+  state[13] = Ako;
+  state[14] = Aku;
+  state[15] = Ama;
+  state[16] = Ame;
+  state[17] = Ami;
+  state[18] = Amo;
+  state[19] = Amu;
+  state[20] = Asa;
+  state[21] = Ase;
+  state[22] = Asi;
+  state[23] = Aso;
+  state[24] = Asu;
+#endif
 }
 
 /*************************************************
@@ -463,39 +504,41 @@
                             uint8_t *out1,
                             size_t nblocks,
                             unsigned int r,
-                            v128 s[25]) {
-    unsigned int i;
+                            v128 s[25]){
+  unsigned int i;
 
-    uint64x1_t a, b;
-    uint64x2x2_t a2, b2;
+  uint64x1_t a, b;
+  uint64x2x2_t a2, b2;
 
-    while (nblocks > 0) {
-        KeccakF1600_StatePermutex2(s);
+  while (nblocks > 0)
+  {
+    KeccakF1600_StatePermutex2(s);
 
-        for (i = 0; i < r / 8 - 1; i += 4) {
-            a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
-            b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
-            a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
-            b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
-            vst1q_u64_x2((uint64_t *)out0, a2);
-            vst1q_u64_x2((uint64_t *)out1, b2);
+    for (i = 0; i < r / 8 - 1; i += 4)
+    {
+      a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
+      b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
+      a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
+      b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
+      vst1q_u64_x2((uint64_t *)out0, a2);
+      vst1q_u64_x2((uint64_t *)out1, b2);
 
-            out0 += 32;
-            out1 += 32;
-        }
-
-        i = r / 8 - 1;
-        // Last iteration
-        a = vget_low_u64(s[i]);
-        b = vget_high_u64(s[i]);
-        vst1_u64((uint64_t *)out0, a);
-        vst1_u64((uint64_t *)out1, b);
-
-        out0 += 8;
-        out1 += 8;
-
-        --nblocks;
+      out0 += 32;
+      out1 += 32;
     }
+
+    i = r / 8 - 1;
+    // Last iteration
+    a = vget_low_u64(s[i]);
+    b = vget_high_u64(s[i]);
+    vst1_u64((uint64_t *)out0, a);
+    vst1_u64((uint64_t *)out1, b);
+
+    out0 += 8;
+    out1 += 8;
+
+    --nblocks;
+  }
 }
 
 /*************************************************
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/fips202x2.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/fips202x2.h
index e2ee105..84568f3 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/fips202x2.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/fips202x2.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef FIPS202X2_H
 #define FIPS202X2_H
 
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/macros.inc b/src/sig/dilithium/pqclean_dilithium2_aarch64/macros.inc
index 66c1333..ef3af4c 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/macros.inc
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/macros.inc
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros_common.inc"
 
 .macro wrap_trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3, qS, dD
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/macros_common.inc b/src/sig/dilithium/pqclean_dilithium2_aarch64/macros_common.inc
index df151bb..bd7e77e 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/macros_common.inc
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/macros_common.inc
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 // for ABI
 
 .macro push_all
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/ntt.c b/src/sig/dilithium/pqclean_dilithium2_aarch64/ntt.c
index 27875b2..d8909dc 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/ntt.c
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/ntt.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "params.h"
 #include "reduce.h"
 #include <stdint.h>
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/ntt.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/ntt.h
index a9d8ede..5543e95 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/ntt.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/ntt.h
@@ -1,29 +1,61 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_H
 #define NTT_H
 #include "NTT_params.h"
 #include "params.h"
 #include <stdint.h>
 
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
 
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
 
 #define NTT(in) { \
-        PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
-        PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
     }
 
 #define iNTT(in) { \
-        PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
-        PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
     }
 
 #define ntt DILITHIUM_NAMESPACE(ntt)
-void ntt(int32_t a[N]);
+void ntt(int32_t a[ARRAY_N]);
 #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont)
-void invntt_tomont(int32_t a[N]);
+void invntt_tomont(int32_t a[ARRAY_N]);
 
 static const int constants[16] = {
     Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime,
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/packing.c b/src/sig/dilithium/pqclean_dilithium2_aarch64/packing.c
index a93b9d8..9ac5e36 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/packing.c
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/packing.c
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #include "packing.h"
 #include "params.h"
 #include "poly.h"
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/packing.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/packing.h
index 5f49829..03f8933 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/packing.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/packing.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef PACKING_H
 #define PACKING_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/params.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/params.h
index 7c0f0aa..2f121ab 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/params.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/params.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef PARAMS_H
 #define PARAMS_H
 
@@ -35,9 +42,17 @@
 #define POLYVECH_PACKEDBYTES (OMEGA + K)
 
 
+#if GAMMA1 == (1 << 17)
 #define POLYZ_PACKEDBYTES   576
+#elif GAMMA1 == (1 << 19)
+#define POLYZ_PACKEDBYTES   640
+#endif
 
+#if GAMMA2 == (DILITHIUM_Q-1)/88
 #define POLYW1_PACKEDBYTES  192
+#elif GAMMA2 == (DILITHIUM_Q-1)/32
+#define POLYW1_PACKEDBYTES  128
+#endif
 
 #define POLYETA_PACKEDBYTES  96
 
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/poly.c b/src/sig/dilithium/pqclean_dilithium2_aarch64/poly.c
index c627a8d..f6f303a 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/poly.c
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/poly.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "params.h"
 #include "poly.h"
 #include "reduce.h"
@@ -25,11 +57,11 @@
 *
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce(int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce(int32_t *, const int32_t *);
 void poly_reduce(poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce(a->coeffs, montgomery_const);
 
     DBENCH_STOP(*tred);
 }
@@ -42,11 +74,11 @@
 *
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq(int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq(int32_t *, const int32_t *);
 void poly_caddq(poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq(a->coeffs, montgomery_const);
 
     DBENCH_STOP(*tred);
 }
@@ -59,11 +91,11 @@
 *
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze(int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze(int32_t *, const int32_t *);
 void poly_freeze(poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze(a->coeffs, montgomery_const);
 
     DBENCH_STOP(*tred);
 }
@@ -173,11 +205,11 @@
 *              - const poly *a: pointer to first input polynomial
 *              - const poly *b: pointer to second input polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
 void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
 
     DBENCH_STOP(*tmul);
 }
@@ -194,11 +226,11 @@
 *              - poly *a0: pointer to output polynomial with coefficients c0
 *              - const poly *a: pointer to input polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
 void poly_power2round(poly *a1, poly *a0, const poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
+    PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
 
     DBENCH_STOP(*tround);
 }
@@ -706,11 +738,11 @@
 * Arguments:   - poly *r: pointer to output polynomial
 *              - const uint8_t *a: byte array with bit-packed polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32(int32_t *, const uint8_t *);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32(int32_t *, const uint8_t *);
 void polyt1_unpack(poly *r, const uint8_t *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32(r->coeffs, a);
+    PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32(r->coeffs, a);
 
     DBENCH_STOP(*tpack);
 }
@@ -841,6 +873,7 @@
     uint32_t t[4];
     DBENCH_START();
 
+    #if GAMMA1 == (1 << 17)
 
     for (i = 0; i < N / 4; ++i) {
         t[0] = GAMMA1 - a->coeffs[4 * i + 0];
@@ -862,6 +895,25 @@
         r[9 * i + 8]  = t[3] >> 10;
     }
 
+    #elif GAMMA1 == (1 << 19)
+
+    for (i = 0; i < N / 2; ++i) {
+        t[0] = GAMMA1 - a->coeffs[2 * i + 0];
+        t[1] = GAMMA1 - a->coeffs[2 * i + 1];
+
+        r[5 * i + 0]  = t[0];
+        r[5 * i + 1]  = t[0] >> 8;
+        r[5 * i + 2]  = t[0] >> 16;
+        r[5 * i + 2] |= t[1] << 4;
+        r[5 * i + 3]  = t[1] >> 4;
+        r[5 * i + 4]  = t[1] >> 12;
+    }
+
+    #else
+
+#error "No parameter specified!"
+
+    #endif
 
     DBENCH_STOP(*tpack);
 }
@@ -879,6 +931,7 @@
     unsigned int i;
     DBENCH_START();
 
+    #if GAMMA1 == (1 << 17)
 
     for (i = 0; i < N / 4; ++i) {
         r->coeffs[4 * i + 0]  = a[9 * i + 0];
@@ -907,6 +960,28 @@
         r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3];
     }
 
+    #elif GAMMA1 == (1 << 19)
+
+    for (i = 0; i < N / 2; ++i) { /* each iteration unpacks 5 bytes into 2 coefficients */
+        r->coeffs[2 * i + 0]  = a[5 * i + 0];
+        r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8;
+        r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 2] << 16;
+        r->coeffs[2 * i + 0] &= 0xFFFFF; /* keep the low 20 bits; the high nibble of byte 2 belongs to the next coefficient */
+
+        r->coeffs[2 * i + 1]  = a[5 * i + 2] >> 4; /* high nibble of byte 2 */
+        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4;
+        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12;
+        r->coeffs[2 * i + 1] &= 0xFFFFF; /* mask the second coefficient (previously re-masked coefficient 0) */
+
+        r->coeffs[2 * i + 0] = GAMMA1 - r->coeffs[2 * i + 0]; /* undo the GAMMA1 - coeff mapping applied when packing */
+        r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1];
+    }
+
+    #else
+
+#error "No parameter specified!"
+
+    #endif
 
     DBENCH_STOP(*tpack);
 }
@@ -925,6 +1000,7 @@
     unsigned int i;
     DBENCH_START();
 
+    #if GAMMA2 == (DILITHIUM_Q-1)/88
 
     for (i = 0; i < N / 4; ++i) {
         r[3 * i + 0]  = a->coeffs[4 * i + 0];
@@ -935,6 +1011,17 @@
         r[3 * i + 2] |= a->coeffs[4 * i + 3] << 2;
     }
 
+    #elif GAMMA2 == (DILITHIUM_Q-1)/32
+
+    for (i = 0; i < N / 2; ++i) {
+        r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4);
+    }
+
+    #else
+
+#error "No parameter specified!"
+
+    #endif
 
     DBENCH_STOP(*tpack);
 }
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/poly.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/poly.h
index bad4e78..9f00fa6 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/poly.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/poly.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef POLY_H
 #define POLY_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/polyvec.c b/src/sig/dilithium/pqclean_dilithium2_aarch64/polyvec.c
index 1961a9e..83fb05e 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/polyvec.c
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/polyvec.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "params.h"
 #include "poly.h"
 #include "polyvec.h"
@@ -146,11 +178,11 @@
 *              - const polyvecl *u: pointer to first input vector
 *              - const polyvecl *v: pointer to second input vector
 **************************************************/
-extern void PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
 void polyvecl_pointwise_acc_montgomery(poly *w,
                                        const polyvecl *u,
                                        const polyvecl *v) {
-    PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
+    PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
 }
 
 /*************************************************
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/polyvec.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/polyvec.h
index 59d2d15..8fb7f73 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/polyvec.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/polyvec.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef POLYVEC_H
 #define POLYVEC_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/reduce.c b/src/sig/dilithium/pqclean_dilithium2_aarch64/reduce.c
index ab06800..4bf239a 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/reduce.c
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/reduce.c
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #include "params.h"
 #include "reduce.h"
 #include <stdint.h>
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/reduce.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/reduce.h
index c8bc606..8ca9a37 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/reduce.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/reduce.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef REDUCE_H
 #define REDUCE_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/rounding.c b/src/sig/dilithium/pqclean_dilithium2_aarch64/rounding.c
index 25d70da..91c04d1 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/rounding.c
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/rounding.c
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #include "params.h"
 #include "rounding.h"
 #include <stdint.h>
@@ -40,10 +47,21 @@
     int32_t a1;
 
     a1  = (a + 127) >> 7;
+    #if GAMMA2 == (DILITHIUM_Q-1)/32
+
+    a1  = (a1 * 1025 + (1 << 21)) >> 22;
+    a1 &= 15;
+
+    #elif GAMMA2 == (DILITHIUM_Q-1)/88
 
     a1  = (a1 * 11275 + (1 << 23)) >> 24;
     a1 ^= ((43 - a1) >> 31) & a1;
 
+    #else
+
+#error "No parameter specified"
+
+    #endif
 
     *a0  = a - a1 * 2 * GAMMA2;
     *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q;
@@ -87,10 +105,18 @@
         return a1;
     }
 
+    #if GAMMA2 == (DILITHIUM_Q-1)/32
+
+    if (a0 > 0) {
+        return (a1 + 1) & 15;
+    }
+    return (a1 - 1) & 15;
+    #elif GAMMA2 == (DILITHIUM_Q-1)/88
 
     if (a0 > 0) {
         return (a1 == 43) ?  0 : a1 + 1;
     }
     return (a1 ==  0) ? 43 : a1 - 1;
+    #endif
 
 }
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/rounding.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/rounding.h
index ec60cee..a888737 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/rounding.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/rounding.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef ROUNDING_H
 #define ROUNDING_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/sign.c b/src/sig/dilithium/pqclean_dilithium2_aarch64/sign.c
index e6c032d..a299d72 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/sign.c
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/sign.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "fips202.h"
 #include "packing.h"
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/sign.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/sign.h
index f577b11..fba1bf1 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/sign.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/sign.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef SIGN_H
 #define SIGN_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/symmetric-shake.c b/src/sig/dilithium/pqclean_dilithium2_aarch64/symmetric-shake.c
index 878d655..a53074a 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/symmetric-shake.c
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/symmetric-shake.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "fips202.h"
 #include "params.h"
 #include "symmetric.h"
diff --git a/src/sig/dilithium/pqclean_dilithium2_aarch64/symmetric.h b/src/sig/dilithium/pqclean_dilithium2_aarch64/symmetric.h
index af3be4f..3739282 100644
--- a/src/sig/dilithium/pqclean_dilithium2_aarch64/symmetric.h
+++ b/src/sig/dilithium/pqclean_dilithium2_aarch64/symmetric.h
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef SYMMETRIC_H
 #define SYMMETRIC_H
 #include "fips202.h"
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/Makefile b/src/sig/dilithium/pqclean_dilithium3_aarch64/Makefile
deleted file mode 100644
index 490d8af..0000000
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-# This Makefile can be used with GNU Make or BSD Make
-
-LIB=libdilithium3_aarch64.a
-HEADERS=api.h fips202x2.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h
-OBJECTS=fips202x2.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o
-
-
-CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g
-
-all: $(LIB)
-
-%.o: %.c $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-%.o: %.S $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-$(LIB): $(OBJECTS) $(HEADERS)
-	$(AR) -r $@ $(OBJECTS)
-
-clean:
-	$(RM) $(OBJECTS)
-	$(RM) $(LIB)
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/NTT_params.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/NTT_params.h
index 097952b..661952e 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/NTT_params.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/NTT_params.h
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_PARAMS_H
 #define NTT_PARAMS_H
 
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_NTT.S b/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_NTT.S
index ab685db..96d9651 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_NTT.S
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_NTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top
+PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top:
 
     push_all
     Q         .req w20
@@ -183,13 +206,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot
+PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot:
 
     push_all
     Q         .req w20
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_iNTT.S b/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_iNTT.S
index a438319..119f752 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_iNTT.S
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_iNTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top
+PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top:
 
     push_all
     Q         .req w20
@@ -412,13 +435,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot
+PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot:
 
     push_all
     Q         .req w20
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_poly.S b/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_poly.S
index bd41a9f..ed888fd 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_poly.S
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/__asm_poly.S
@@ -1,15 +1,38 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 #include "params.h"
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32
+PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32:
 
     mov x7, #16
     _10_to_32_loop:
@@ -79,13 +102,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce
+PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce:
 
     ldr w4, [x1]
 
@@ -175,13 +195,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq
+PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq:
 
     ldr w4, [x1]
 
@@ -271,13 +288,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze
+PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze:
 
     ldr w4, [x1]
 
@@ -403,13 +417,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round
+PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round:
 
     mov w4, #1
 
@@ -552,13 +563,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add
+PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add:
 
     ld1 {v0.4S}, [x1], #16
     ld1 {v4.4S}, [x2], #16
@@ -604,13 +612,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub
+PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub:
 
     ld1 {v0.4S}, [x1], #16
     ld1 {v4.4S}, [x2], #16
@@ -656,13 +661,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl
+PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl:
 
     add x1, x0, #0
 
@@ -726,13 +728,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery
+PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery:
 
     push_all
 
@@ -848,13 +847,10 @@
 
 
 .align 2
-.global PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery
-.global _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery, %function
-#endif
-PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery:
-_PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery:
+.global PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+.global _PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
+_PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
 
     push_all
 
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/api.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/api.h
index 1a76165..6a05644 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/api.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/api.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef PQCLEAN_DILITHIUM3_AARCH64_API_H
 #define PQCLEAN_DILITHIUM3_AARCH64_API_H
 
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/feat.S b/src/sig/dilithium/pqclean_dilithium3_aarch64/feat.S
new file mode 100644
index 0000000..358adf6
--- /dev/null
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/feat.S
@@ -0,0 +1,168 @@
+
+/*
+MIT License
+
+Copyright (c) 2020 Bas Westerbaan
+Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3)
+
+.macro round
+    // One Keccak-f[1600] round on the state held in v0-v24 (v25-v31 are scratch); x1 points at this round's constant.
+    // Theta, first half: column parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i], not yet xored into the state.
+    eor3 v25.16b, v0.16b, v5.16b, v10.16b
+    eor3 v26.16b, v1.16b, v6.16b, v11.16b
+    eor3 v27.16b, v2.16b, v7.16b, v12.16b
+    eor3 v28.16b, v3.16b, v8.16b, v13.16b
+    eor3 v29.16b, v4.16b, v9.16b, v14.16b
+
+    eor3 v25.16b, v25.16b, v15.16b, v20.16b
+    eor3 v26.16b, v26.16b, v16.16b, v21.16b
+    eor3 v27.16b, v27.16b, v17.16b, v22.16b
+    eor3 v28.16b, v28.16b, v18.16b, v23.16b
+    eor3 v29.16b, v29.16b, v19.16b, v24.16b
+
+    rax1 v30.2d, v29.2d, v26.2d // d[0] = rotl(p[1], 1) ^ p[4]
+    rax1 v29.2d, v27.2d, v29.2d // d[3] = rotl(p[4], 1) ^ p[2]
+    rax1 v27.2d, v25.2d, v27.2d // d[1] = rotl(p[2], 1) ^ p[0]
+    rax1 v25.2d, v28.2d, v25.2d // d[4] = rotl(p[0], 1) ^ p[3]
+    rax1 v28.2d, v26.2d, v28.2d // d[2] = rotl(p[3], 1) ^ p[1]
+
+    // Theta, second half, fused with rho and pi: each xar xors d[] into a lane and
+    // rotates it into its new position. v31 keeps the old v1, which is consumed last.
+    eor v0.16b, v0.16b, v30.16b
+    mov v31.16b, v1.16b
+    xar v1.2d,  v6.2d,  v27.2d, #20
+    xar v6.2d,  v9.2d,  v25.2d, #44
+    xar v9.2d,  v22.2d, v28.2d, #3
+    xar v22.2d, v14.2d, v25.2d, #25
+    xar v14.2d, v20.2d, v30.2d, #46
+    xar v20.2d, v2.2d,  v28.2d, #2
+    xar v2.2d,  v12.2d, v28.2d, #21
+    xar v12.2d, v13.2d, v29.2d, #39
+    xar v13.2d, v19.2d, v25.2d, #56
+    xar v19.2d, v23.2d, v29.2d, #8
+    xar v23.2d, v15.2d, v30.2d, #23
+    xar v15.2d, v4.2d,  v25.2d, #37
+    xar v4.2d,  v24.2d, v25.2d, #50
+    xar v24.2d, v21.2d, v27.2d, #62
+    xar v21.2d, v8.2d,  v29.2d, #9
+    xar v8.2d,  v16.2d, v27.2d, #19
+    xar v16.2d, v5.2d,  v30.2d, #28
+    xar v5.2d,  v3.2d,  v29.2d, #36
+    xar v3.2d,  v18.2d, v29.2d, #43
+    xar v18.2d, v17.2d, v28.2d, #49
+    xar v17.2d, v11.2d, v27.2d, #54
+    xar v11.2d, v7.2d,  v28.2d, #58
+    xar v7.2d,  v10.2d, v30.2d, #61
+    xar v10.2d, v31.2d, v27.2d, #63
+
+    // Chi, one row of five lanes at a time; v25/v26 hold the first two results until the row is done
+    bcax v25.16b, v0.16b,  v2.16b,  v1.16b
+    bcax v26.16b, v1.16b,  v3.16b,  v2.16b
+    bcax v2.16b,  v2.16b,  v4.16b,  v3.16b
+    bcax v3.16b,  v3.16b,  v0.16b,  v4.16b
+    bcax v4.16b,  v4.16b,  v1.16b,  v0.16b
+    mov v0.16b, v25.16b
+    mov v1.16b, v26.16b
+
+    bcax v25.16b, v5.16b,  v7.16b,  v6.16b
+    bcax v26.16b, v6.16b,  v8.16b,  v7.16b
+    bcax v7.16b,  v7.16b,  v9.16b,  v8.16b
+    bcax v8.16b,  v8.16b,  v5.16b,  v9.16b
+    bcax v9.16b,  v9.16b,  v6.16b,  v5.16b
+    mov v5.16b, v25.16b
+    mov v6.16b, v26.16b
+
+    bcax v25.16b, v10.16b,  v12.16b,  v11.16b
+    bcax v26.16b, v11.16b,  v13.16b,  v12.16b
+    bcax v12.16b, v12.16b,  v14.16b,  v13.16b
+    bcax v13.16b, v13.16b,  v10.16b,  v14.16b
+    bcax v14.16b, v14.16b,  v11.16b,  v10.16b
+    mov v10.16b, v25.16b
+    mov v11.16b, v26.16b
+
+    bcax v25.16b, v15.16b,  v17.16b,  v16.16b
+    bcax v26.16b, v16.16b,  v18.16b,  v17.16b
+    bcax v17.16b, v17.16b,  v19.16b,  v18.16b
+    bcax v18.16b, v18.16b,  v15.16b,  v19.16b
+    bcax v19.16b, v19.16b,  v16.16b,  v15.16b
+    mov v15.16b, v25.16b
+    mov v16.16b, v26.16b
+
+    bcax v25.16b, v20.16b,  v22.16b,  v21.16b
+    bcax v26.16b, v21.16b,  v23.16b,  v22.16b
+    bcax v22.16b, v22.16b,  v24.16b,  v23.16b
+    bcax v23.16b, v23.16b,  v20.16b,  v24.16b
+    bcax v24.16b, v24.16b,  v21.16b,  v20.16b
+    mov v20.16b, v25.16b
+    mov v21.16b, v26.16b
+
+    // Iota: broadcast the 64-bit round constant at [x1] to both halves, xor into lane 0, advance x1 by 8
+    ld1r {v25.2d}, [x1], #8
+    eor v0.16b, v0.16b, v25.16b
+.endm
+
+.align 4
+.global PQCLEAN_DILITHIUM3_AARCH64_f1600x2
+.global _PQCLEAN_DILITHIUM3_AARCH64_f1600x2
+PQCLEAN_DILITHIUM3_AARCH64_f1600x2:
+_PQCLEAN_DILITHIUM3_AARCH64_f1600x2:
+    stp d8,  d9,  [sp,#-16]!    // the round macro overwrites v8-v15, so d8-d15 are saved here and restored before returning
+    stp d10, d11, [sp,#-16]!
+    stp d12, d13, [sp,#-16]!
+    stp d14, d15, [sp,#-16]!
+    // x0 = state (25 x 128-bit, permuted in place), x1 = round constants (one 64-bit word is read per round)
+    mov x2, x0                  // keep the state base; x0 is advanced by the loads below
+    mov x3, #24                 // number of rounds still to run
+
+    ld1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    ld1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    ld1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    ld1 {v24.2d}, [x0]
+
+1:
+    round
+
+    sub x3, x3, #1              // cbnz tests the register itself, so no flags are needed
+    cbnz x3, 1b
+
+    mov x0, x2                  // rewind to the state base and store the permuted state back
+    st1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    st1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    st1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    st1 {v24.2d}, [x0]
+
+    ldp d14, d15, [sp], #16
+    ldp d12, d13, [sp], #16
+    ldp d10, d11, [sp], #16
+    ldp d8,  d9,  [sp], #16
+
+    ret lr
+
+#endif
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/fips202x2.c b/src/sig/dilithium/pqclean_dilithium3_aarch64/fips202x2.c
index 3924900..e36a678 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/fips202x2.c
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/fips202x2.c
@@ -1,6 +1,40 @@
-#include "fips202x2.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <arm_neon.h>
 #include <stddef.h>
+#include "fips202x2.h"
 
 
 #define NROUNDS 24
@@ -67,275 +101,282 @@
 *
 * Arguments:   - uint64_t *state: pointer to input/output Keccak state
 **************************************************/
+extern void PQCLEAN_DILITHIUM3_AARCH64_f1600x2(v128*, const uint64_t*);
 static inline
-void KeccakF1600_StatePermutex2(v128 state[25]) {
-    v128 Aba, Abe, Abi, Abo, Abu;
-    v128 Aga, Age, Agi, Ago, Agu;
-    v128 Aka, Ake, Aki, Ako, Aku;
-    v128 Ama, Ame, Ami, Amo, Amu;
-    v128 Asa, Ase, Asi, Aso, Asu;
-    v128 BCa, BCe, BCi, BCo, BCu; // tmp
-    v128 Da, De, Di, Do, Du;      // D
-    v128 Eba, Ebe, Ebi, Ebo, Ebu;
-    v128 Ega, Ege, Egi, Ego, Egu;
-    v128 Eka, Eke, Eki, Eko, Eku;
-    v128 Ema, Eme, Emi, Emo, Emu;
-    v128 Esa, Ese, Esi, Eso, Esu;
+void KeccakF1600_StatePermutex2(v128 state[25])
+{
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* same guard as feat.S: the assembly routine is built, so let it run the whole permutation */
+  PQCLEAN_DILITHIUM3_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
+#else /* NEON intrinsics path: two rounds per loop iteration (A -> E, then E -> A) */
+  v128 Aba, Abe, Abi, Abo, Abu;
+  v128 Aga, Age, Agi, Ago, Agu;
+  v128 Aka, Ake, Aki, Ako, Aku;
+  v128 Ama, Ame, Ami, Amo, Amu;
+  v128 Asa, Ase, Asi, Aso, Asu;
+  v128 BCa, BCe, BCi, BCo, BCu; // tmp
+  v128 Da, De, Di, Do, Du;      // D
+  v128 Eba, Ebe, Ebi, Ebo, Ebu;
+  v128 Ega, Ege, Egi, Ego, Egu;
+  v128 Eka, Eke, Eki, Eko, Eku;
+  v128 Ema, Eme, Emi, Emo, Emu;
+  v128 Esa, Ese, Esi, Eso, Esu;
 
-    //copyFromState(A, state)
-    Aba = state[0];
-    Abe = state[1];
-    Abi = state[2];
-    Abo = state[3];
-    Abu = state[4];
-    Aga = state[5];
-    Age = state[6];
-    Agi = state[7];
-    Ago = state[8];
-    Agu = state[9];
-    Aka = state[10];
-    Ake = state[11];
-    Aki = state[12];
-    Ako = state[13];
-    Aku = state[14];
-    Ama = state[15];
-    Ame = state[16];
-    Ami = state[17];
-    Amo = state[18];
-    Amu = state[19];
-    Asa = state[20];
-    Ase = state[21];
-    Asi = state[22];
-    Aso = state[23];
-    Asu = state[24];
+  // copyFromState(A, state): load the 25 state lanes into locals
+  Aba = state[0];
+  Abe = state[1];
+  Abi = state[2];
+  Abo = state[3];
+  Abu = state[4];
+  Aga = state[5];
+  Age = state[6];
+  Agi = state[7];
+  Ago = state[8];
+  Agu = state[9];
+  Aka = state[10];
+  Ake = state[11];
+  Aki = state[12];
+  Ako = state[13];
+  Aku = state[14];
+  Ama = state[15];
+  Ame = state[16];
+  Ami = state[17];
+  Amo = state[18];
+  Amu = state[19];
+  Asa = state[20];
+  Ase = state[21];
+  Asi = state[22];
+  Aso = state[23];
+  Asu = state[24];
 
-    for (int round = 0; round < NROUNDS; round += 2) {
-        //    prepareTheta
-        vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
-        vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
-        vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
-        vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
-        vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
+  for (int round = 0; round < NROUNDS; round += 2)
+  {
+    //    prepareTheta
+    vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
+    vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
+    vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
+    vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
+    vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Aba, Aba, Da);
-        vxor(Age, Age, De);
-        vROL(BCe, Age, 44);
-        vxor(Aki, Aki, Di);
-        vROL(BCi, Aki, 43);
-        vxor(Amo, Amo, Do);
-        vROL(BCo, Amo, 21);
-        vxor(Asu, Asu, Du);
-        vROL(BCu, Asu, 14);
-        vXNA(Eba, Aba, BCe, BCi);
-        vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
-        vXNA(Ebe, BCe, BCi, BCo);
-        vXNA(Ebi, BCi, BCo, BCu);
-        vXNA(Ebo, BCo, BCu, Aba);
-        vXNA(Ebu, BCu, Aba, BCe);
+    vxor(Aba, Aba, Da);
+    vxor(Age, Age, De);
+    vROL(BCe, Age, 44);
+    vxor(Aki, Aki, Di);
+    vROL(BCi, Aki, 43);
+    vxor(Amo, Amo, Do);
+    vROL(BCo, Amo, 21);
+    vxor(Asu, Asu, Du);
+    vROL(BCu, Asu, 14);
+    vXNA(Eba, Aba, BCe, BCi);
+    vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
+    vXNA(Ebe, BCe, BCi, BCo);
+    vXNA(Ebi, BCi, BCo, BCu);
+    vXNA(Ebo, BCo, BCu, Aba);
+    vXNA(Ebu, BCu, Aba, BCe);
 
-        vxor(Abo, Abo, Do);
-        vROL(BCa, Abo, 28);
-        vxor(Agu, Agu, Du);
-        vROL(BCe, Agu, 20);
-        vxor(Aka, Aka, Da);
-        vROL(BCi, Aka, 3);
-        vxor(Ame, Ame, De);
-        vROL(BCo, Ame, 45);
-        vxor(Asi, Asi, Di);
-        vROL(BCu, Asi, 61);
-        vXNA(Ega, BCa, BCe, BCi);
-        vXNA(Ege, BCe, BCi, BCo);
-        vXNA(Egi, BCi, BCo, BCu);
-        vXNA(Ego, BCo, BCu, BCa);
-        vXNA(Egu, BCu, BCa, BCe);
+    vxor(Abo, Abo, Do);
+    vROL(BCa, Abo, 28);
+    vxor(Agu, Agu, Du);
+    vROL(BCe, Agu, 20);
+    vxor(Aka, Aka, Da);
+    vROL(BCi, Aka, 3);
+    vxor(Ame, Ame, De);
+    vROL(BCo, Ame, 45);
+    vxor(Asi, Asi, Di);
+    vROL(BCu, Asi, 61);
+    vXNA(Ega, BCa, BCe, BCi);
+    vXNA(Ege, BCe, BCi, BCo);
+    vXNA(Egi, BCi, BCo, BCu);
+    vXNA(Ego, BCo, BCu, BCa);
+    vXNA(Egu, BCu, BCa, BCe);
 
-        vxor(Abe, Abe, De);
-        vROL(BCa, Abe, 1);
-        vxor(Agi, Agi, Di);
-        vROL(BCe, Agi, 6);
-        vxor(Ako, Ako, Do);
-        vROL(BCi, Ako, 25);
-        vxor(Amu, Amu, Du);
-        vROL(BCo, Amu, 8);
-        vxor(Asa, Asa, Da);
-        vROL(BCu, Asa, 18);
-        vXNA(Eka, BCa, BCe, BCi);
-        vXNA(Eke, BCe, BCi, BCo);
-        vXNA(Eki, BCi, BCo, BCu);
-        vXNA(Eko, BCo, BCu, BCa);
-        vXNA(Eku, BCu, BCa, BCe);
+    vxor(Abe, Abe, De);
+    vROL(BCa, Abe, 1);
+    vxor(Agi, Agi, Di);
+    vROL(BCe, Agi, 6);
+    vxor(Ako, Ako, Do);
+    vROL(BCi, Ako, 25);
+    vxor(Amu, Amu, Du);
+    vROL(BCo, Amu, 8);
+    vxor(Asa, Asa, Da);
+    vROL(BCu, Asa, 18);
+    vXNA(Eka, BCa, BCe, BCi);
+    vXNA(Eke, BCe, BCi, BCo);
+    vXNA(Eki, BCi, BCo, BCu);
+    vXNA(Eko, BCo, BCu, BCa);
+    vXNA(Eku, BCu, BCa, BCe);
 
-        vxor(Abu, Abu, Du);
-        vROL(BCa, Abu, 27);
-        vxor(Aga, Aga, Da);
-        vROL(BCe, Aga, 36);
-        vxor(Ake, Ake, De);
-        vROL(BCi, Ake, 10);
-        vxor(Ami, Ami, Di);
-        vROL(BCo, Ami, 15);
-        vxor(Aso, Aso, Do);
-        vROL(BCu, Aso, 56);
-        vXNA(Ema, BCa, BCe, BCi);
-        vXNA(Eme, BCe, BCi, BCo);
-        vXNA(Emi, BCi, BCo, BCu);
-        vXNA(Emo, BCo, BCu, BCa);
-        vXNA(Emu, BCu, BCa, BCe);
+    vxor(Abu, Abu, Du);
+    vROL(BCa, Abu, 27);
+    vxor(Aga, Aga, Da);
+    vROL(BCe, Aga, 36);
+    vxor(Ake, Ake, De);
+    vROL(BCi, Ake, 10);
+    vxor(Ami, Ami, Di);
+    vROL(BCo, Ami, 15);
+    vxor(Aso, Aso, Do);
+    vROL(BCu, Aso, 56);
+    vXNA(Ema, BCa, BCe, BCi);
+    vXNA(Eme, BCe, BCi, BCo);
+    vXNA(Emi, BCi, BCo, BCu);
+    vXNA(Emo, BCo, BCu, BCa);
+    vXNA(Emu, BCu, BCa, BCe);
 
-        vxor(Abi, Abi, Di);
-        vROL(BCa, Abi, 62);
-        vxor(Ago, Ago, Do);
-        vROL(BCe, Ago, 55);
-        vxor(Aku, Aku, Du);
-        vROL(BCi, Aku, 39);
-        vxor(Ama, Ama, Da);
-        vROL(BCo, Ama, 41);
-        vxor(Ase, Ase, De);
-        vROL(BCu, Ase, 2);
-        vXNA(Esa, BCa, BCe, BCi);
-        vXNA(Ese, BCe, BCi, BCo);
-        vXNA(Esi, BCi, BCo, BCu);
-        vXNA(Eso, BCo, BCu, BCa);
-        vXNA(Esu, BCu, BCa, BCe);
+    vxor(Abi, Abi, Di);
+    vROL(BCa, Abi, 62);
+    vxor(Ago, Ago, Do);
+    vROL(BCe, Ago, 55);
+    vxor(Aku, Aku, Du);
+    vROL(BCi, Aku, 39);
+    vxor(Ama, Ama, Da);
+    vROL(BCo, Ama, 41);
+    vxor(Ase, Ase, De);
+    vROL(BCu, Ase, 2);
+    vXNA(Esa, BCa, BCe, BCi);
+    vXNA(Ese, BCe, BCi, BCo);
+    vXNA(Esi, BCi, BCo, BCu);
+    vXNA(Eso, BCo, BCu, BCa);
+    vXNA(Esu, BCu, BCa, BCe);
 
-        // Next Round
+    // Next round: the same steps with the roles of E and A swapped
 
-        //    prepareTheta
-        vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
-        vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
-        vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
-        vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
-        vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
+    //    prepareTheta
+    vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
+    vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
+    vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
+    vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
+    vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Eba, Eba, Da);
-        vxor(Ege, Ege, De);
-        vROL(BCe, Ege, 44);
-        vxor(Eki, Eki, Di);
-        vROL(BCi, Eki, 43);
-        vxor(Emo, Emo, Do);
-        vROL(BCo, Emo, 21);
-        vxor(Esu, Esu, Du);
-        vROL(BCu, Esu, 14);
-        vXNA(Aba, Eba, BCe, BCi);
-        vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
-        vXNA(Abe, BCe, BCi, BCo);
-        vXNA(Abi, BCi, BCo, BCu);
-        vXNA(Abo, BCo, BCu, Eba);
-        vXNA(Abu, BCu, Eba, BCe);
+    vxor(Eba, Eba, Da);
+    vxor(Ege, Ege, De);
+    vROL(BCe, Ege, 44);
+    vxor(Eki, Eki, Di);
+    vROL(BCi, Eki, 43);
+    vxor(Emo, Emo, Do);
+    vROL(BCo, Emo, 21);
+    vxor(Esu, Esu, Du);
+    vROL(BCu, Esu, 14);
+    vXNA(Aba, Eba, BCe, BCi);
+    vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
+    vXNA(Abe, BCe, BCi, BCo);
+    vXNA(Abi, BCi, BCo, BCu);
+    vXNA(Abo, BCo, BCu, Eba);
+    vXNA(Abu, BCu, Eba, BCe);
 
-        vxor(Ebo, Ebo, Do);
-        vROL(BCa, Ebo, 28);
-        vxor(Egu, Egu, Du);
-        vROL(BCe, Egu, 20);
-        vxor(Eka, Eka, Da);
-        vROL(BCi, Eka, 3);
-        vxor(Eme, Eme, De);
-        vROL(BCo, Eme, 45);
-        vxor(Esi, Esi, Di);
-        vROL(BCu, Esi, 61);
-        vXNA(Aga, BCa, BCe, BCi);
-        vXNA(Age, BCe, BCi, BCo);
-        vXNA(Agi, BCi, BCo, BCu);
-        vXNA(Ago, BCo, BCu, BCa);
-        vXNA(Agu, BCu, BCa, BCe);
+    vxor(Ebo, Ebo, Do);
+    vROL(BCa, Ebo, 28);
+    vxor(Egu, Egu, Du);
+    vROL(BCe, Egu, 20);
+    vxor(Eka, Eka, Da);
+    vROL(BCi, Eka, 3);
+    vxor(Eme, Eme, De);
+    vROL(BCo, Eme, 45);
+    vxor(Esi, Esi, Di);
+    vROL(BCu, Esi, 61);
+    vXNA(Aga, BCa, BCe, BCi);
+    vXNA(Age, BCe, BCi, BCo);
+    vXNA(Agi, BCi, BCo, BCu);
+    vXNA(Ago, BCo, BCu, BCa);
+    vXNA(Agu, BCu, BCa, BCe);
 
-        vxor(Ebe, Ebe, De);
-        vROL(BCa, Ebe, 1);
-        vxor(Egi, Egi, Di);
-        vROL(BCe, Egi, 6);
-        vxor(Eko, Eko, Do);
-        vROL(BCi, Eko, 25);
-        vxor(Emu, Emu, Du);
-        vROL(BCo, Emu, 8);
-        vxor(Esa, Esa, Da);
-        vROL(BCu, Esa, 18);
-        vXNA(Aka, BCa, BCe, BCi);
-        vXNA(Ake, BCe, BCi, BCo);
-        vXNA(Aki, BCi, BCo, BCu);
-        vXNA(Ako, BCo, BCu, BCa);
-        vXNA(Aku, BCu, BCa, BCe);
+    vxor(Ebe, Ebe, De);
+    vROL(BCa, Ebe, 1);
+    vxor(Egi, Egi, Di);
+    vROL(BCe, Egi, 6);
+    vxor(Eko, Eko, Do);
+    vROL(BCi, Eko, 25);
+    vxor(Emu, Emu, Du);
+    vROL(BCo, Emu, 8);
+    vxor(Esa, Esa, Da);
+    vROL(BCu, Esa, 18);
+    vXNA(Aka, BCa, BCe, BCi);
+    vXNA(Ake, BCe, BCi, BCo);
+    vXNA(Aki, BCi, BCo, BCu);
+    vXNA(Ako, BCo, BCu, BCa);
+    vXNA(Aku, BCu, BCa, BCe);
 
-        vxor(Ebu, Ebu, Du);
-        vROL(BCa, Ebu, 27);
-        vxor(Ega, Ega, Da);
-        vROL(BCe, Ega, 36);
-        vxor(Eke, Eke, De);
-        vROL(BCi, Eke, 10);
-        vxor(Emi, Emi, Di);
-        vROL(BCo, Emi, 15);
-        vxor(Eso, Eso, Do);
-        vROL(BCu, Eso, 56);
-        vXNA(Ama, BCa, BCe, BCi);
-        vXNA(Ame, BCe, BCi, BCo);
-        vXNA(Ami, BCi, BCo, BCu);
-        vXNA(Amo, BCo, BCu, BCa);
-        vXNA(Amu, BCu, BCa, BCe);
+    vxor(Ebu, Ebu, Du);
+    vROL(BCa, Ebu, 27);
+    vxor(Ega, Ega, Da);
+    vROL(BCe, Ega, 36);
+    vxor(Eke, Eke, De);
+    vROL(BCi, Eke, 10);
+    vxor(Emi, Emi, Di);
+    vROL(BCo, Emi, 15);
+    vxor(Eso, Eso, Do);
+    vROL(BCu, Eso, 56);
+    vXNA(Ama, BCa, BCe, BCi);
+    vXNA(Ame, BCe, BCi, BCo);
+    vXNA(Ami, BCi, BCo, BCu);
+    vXNA(Amo, BCo, BCu, BCa);
+    vXNA(Amu, BCu, BCa, BCe);
 
-        vxor(Ebi, Ebi, Di);
-        vROL(BCa, Ebi, 62);
-        vxor(Ego, Ego, Do);
-        vROL(BCe, Ego, 55);
-        vxor(Eku, Eku, Du);
-        vROL(BCi, Eku, 39);
-        vxor(Ema, Ema, Da);
-        vROL(BCo, Ema, 41);
-        vxor(Ese, Ese, De);
-        vROL(BCu, Ese, 2);
-        vXNA(Asa, BCa, BCe, BCi);
-        vXNA(Ase, BCe, BCi, BCo);
-        vXNA(Asi, BCi, BCo, BCu);
-        vXNA(Aso, BCo, BCu, BCa);
-        vXNA(Asu, BCu, BCa, BCe);
-    }
+    vxor(Ebi, Ebi, Di);
+    vROL(BCa, Ebi, 62);
+    vxor(Ego, Ego, Do);
+    vROL(BCe, Ego, 55);
+    vxor(Eku, Eku, Du);
+    vROL(BCi, Eku, 39);
+    vxor(Ema, Ema, Da);
+    vROL(BCo, Ema, 41);
+    vxor(Ese, Ese, De);
+    vROL(BCu, Ese, 2);
+    vXNA(Asa, BCa, BCe, BCi);
+    vXNA(Ase, BCe, BCi, BCo);
+    vXNA(Asi, BCi, BCo, BCu);
+    vXNA(Aso, BCo, BCu, BCa);
+    vXNA(Asu, BCu, BCa, BCe);
+  }
 
-    state[0] = Aba;
-    state[1] = Abe;
-    state[2] = Abi;
-    state[3] = Abo;
-    state[4] = Abu;
-    state[5] = Aga;
-    state[6] = Age;
-    state[7] = Agi;
-    state[8] = Ago;
-    state[9] = Agu;
-    state[10] = Aka;
-    state[11] = Ake;
-    state[12] = Aki;
-    state[13] = Ako;
-    state[14] = Aku;
-    state[15] = Ama;
-    state[16] = Ame;
-    state[17] = Ami;
-    state[18] = Amo;
-    state[19] = Amu;
-    state[20] = Asa;
-    state[21] = Ase;
-    state[22] = Asi;
-    state[23] = Aso;
-    state[24] = Asu;
+  state[0] = Aba; // write the permuted lanes back to state
+  state[1] = Abe;
+  state[2] = Abi;
+  state[3] = Abo;
+  state[4] = Abu;
+  state[5] = Aga;
+  state[6] = Age;
+  state[7] = Agi;
+  state[8] = Ago;
+  state[9] = Agu;
+  state[10] = Aka;
+  state[11] = Ake;
+  state[12] = Aki;
+  state[13] = Ako;
+  state[14] = Aku;
+  state[15] = Ama;
+  state[16] = Ame;
+  state[17] = Ami;
+  state[18] = Amo;
+  state[19] = Amu;
+  state[20] = Asa;
+  state[21] = Ase;
+  state[22] = Asi;
+  state[23] = Aso;
+  state[24] = Asu;
+#endif /* assembly routine vs. NEON intrinsics path */
 }
 
 /*************************************************
@@ -463,39 +504,41 @@
                             uint8_t *out1,
                             size_t nblocks,
                             unsigned int r,
-                            v128 s[25]) {
-    unsigned int i;
+                            v128 s[25]){
+  unsigned int i;
 
-    uint64x1_t a, b;
-    uint64x2x2_t a2, b2;
+  uint64x1_t a, b;
+  uint64x2x2_t a2, b2;
 
-    while (nblocks > 0) {
-        KeccakF1600_StatePermutex2(s);
+  while (nblocks > 0)
+  {
+    KeccakF1600_StatePermutex2(s);
 
-        for (i = 0; i < r / 8 - 1; i += 4) {
-            a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
-            b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
-            a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
-            b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
-            vst1q_u64_x2((uint64_t *)out0, a2);
-            vst1q_u64_x2((uint64_t *)out1, b2);
+    for (i = 0; i < r / 8 - 1; i += 4)
+    {
+      a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
+      b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
+      a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
+      b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
+      vst1q_u64_x2((uint64_t *)out0, a2);
+      vst1q_u64_x2((uint64_t *)out1, b2);
 
-            out0 += 32;
-            out1 += 32;
-        }
-
-        i = r / 8 - 1;
-        // Last iteration
-        a = vget_low_u64(s[i]);
-        b = vget_high_u64(s[i]);
-        vst1_u64((uint64_t *)out0, a);
-        vst1_u64((uint64_t *)out1, b);
-
-        out0 += 8;
-        out1 += 8;
-
-        --nblocks;
+      out0 += 32;
+      out1 += 32;
     }
+
+    i = r / 8 - 1;
+    // Last iteration
+    a = vget_low_u64(s[i]);
+    b = vget_high_u64(s[i]);
+    vst1_u64((uint64_t *)out0, a);
+    vst1_u64((uint64_t *)out1, b);
+
+    out0 += 8;
+    out1 += 8;
+
+    --nblocks;
+  }
 }
 
 /*************************************************
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/fips202x2.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/fips202x2.h
index e2ee105..84568f3 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/fips202x2.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/fips202x2.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef FIPS202X2_H
 #define FIPS202X2_H
 
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/macros.inc b/src/sig/dilithium/pqclean_dilithium3_aarch64/macros.inc
index 66c1333..ef3af4c 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/macros.inc
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/macros.inc
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros_common.inc"
 
 .macro wrap_trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3, qS, dD
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/macros_common.inc b/src/sig/dilithium/pqclean_dilithium3_aarch64/macros_common.inc
index df151bb..bd7e77e 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/macros_common.inc
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/macros_common.inc
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 // for ABI
 
 .macro push_all
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/ntt.c b/src/sig/dilithium/pqclean_dilithium3_aarch64/ntt.c
index 27875b2..d8909dc 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/ntt.c
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/ntt.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "params.h"
 #include "reduce.h"
 #include <stdint.h>
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/ntt.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/ntt.h
index ced33eb..2f16fac 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/ntt.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/ntt.h
@@ -1,29 +1,61 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_H
 #define NTT_H
 #include "NTT_params.h"
 #include "params.h"
 #include <stdint.h>
 
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
 
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
 
 #define NTT(in) { \
-        PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
-        PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
     }
 
 #define iNTT(in) { \
-        PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
-        PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
     }
 
 #define ntt DILITHIUM_NAMESPACE(ntt)
-void ntt(int32_t a[N]);
+void ntt(int32_t a[ARRAY_N]);
 #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont)
-void invntt_tomont(int32_t a[N]);
+void invntt_tomont(int32_t a[ARRAY_N]);
 
 static const int constants[16] = {
     Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime,
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/packing.c b/src/sig/dilithium/pqclean_dilithium3_aarch64/packing.c
index a93b9d8..9ac5e36 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/packing.c
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/packing.c
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #include "packing.h"
 #include "params.h"
 #include "poly.h"
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/packing.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/packing.h
index 5f49829..03f8933 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/packing.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/packing.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef PACKING_H
 #define PACKING_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/params.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/params.h
index 477a232..0ee6115 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/params.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/params.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef PARAMS_H
 #define PARAMS_H
 
@@ -35,9 +42,17 @@
 #define POLYVECH_PACKEDBYTES (OMEGA + K)
 
 
+#if GAMMA1 == (1 << 17)
+#define POLYZ_PACKEDBYTES   576
+#elif GAMMA1 == (1 << 19)
 #define POLYZ_PACKEDBYTES   640
+#endif
 
+#if GAMMA2 == (DILITHIUM_Q-1)/88
+#define POLYW1_PACKEDBYTES  192
+#elif GAMMA2 == (DILITHIUM_Q-1)/32
 #define POLYW1_PACKEDBYTES  128
+#endif
 
 #define POLYETA_PACKEDBYTES 128
 
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/poly.c b/src/sig/dilithium/pqclean_dilithium3_aarch64/poly.c
index 7bbc86e..f13f981 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/poly.c
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/poly.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "params.h"
 #include "poly.h"
 #include "reduce.h"
@@ -25,11 +57,11 @@
 *
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce(int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce(int32_t *, const int32_t *);
 void poly_reduce(poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce(a->coeffs, montgomery_const);
 
     DBENCH_STOP(*tred);
 }
@@ -42,11 +74,11 @@
 *
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq(int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq(int32_t *, const int32_t *);
 void poly_caddq(poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq(a->coeffs, montgomery_const);
 
     DBENCH_STOP(*tred);
 }
@@ -59,11 +91,11 @@
 *
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze(int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze(int32_t *, const int32_t *);
 void poly_freeze(poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze(a->coeffs, montgomery_const);
 
     DBENCH_STOP(*tred);
 }
@@ -173,11 +205,11 @@
 *              - const poly *a: pointer to first input polynomial
 *              - const poly *b: pointer to second input polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
 void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
 
     DBENCH_STOP(*tmul);
 }
@@ -194,11 +226,11 @@
 *              - poly *a0: pointer to output polynomial with coefficients c0
 *              - const poly *a: pointer to input polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
 void poly_power2round(poly *a1, poly *a0, const poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
+    PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
 
     DBENCH_STOP(*tround);
 }
@@ -682,11 +714,11 @@
 * Arguments:   - poly *r: pointer to output polynomial
 *              - const uint8_t *a: byte array with bit-packed polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32(int32_t *, const uint8_t *);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32(int32_t *, const uint8_t *);
 void polyt1_unpack(poly *r, const uint8_t *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32(r->coeffs, a);
+    PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32(r->coeffs, a);
 
     DBENCH_STOP(*tpack);
 }
@@ -817,6 +849,29 @@
     uint32_t t[4];
     DBENCH_START();
 
+    #if GAMMA1 == (1 << 17)
+
+    for (i = 0; i < N / 4; ++i) {
+        t[0] = GAMMA1 - a->coeffs[4 * i + 0];
+        t[1] = GAMMA1 - a->coeffs[4 * i + 1];
+        t[2] = GAMMA1 - a->coeffs[4 * i + 2];
+        t[3] = GAMMA1 - a->coeffs[4 * i + 3];
+
+        r[9 * i + 0]  = t[0];
+        r[9 * i + 1]  = t[0] >> 8;
+        r[9 * i + 2]  = t[0] >> 16;
+        r[9 * i + 2] |= t[1] << 2;
+        r[9 * i + 3]  = t[1] >> 6;
+        r[9 * i + 4]  = t[1] >> 14;
+        r[9 * i + 4] |= t[2] << 4;
+        r[9 * i + 5]  = t[2] >> 4;
+        r[9 * i + 6]  = t[2] >> 12;
+        r[9 * i + 6] |= t[3] << 6;
+        r[9 * i + 7]  = t[3] >> 2;
+        r[9 * i + 8]  = t[3] >> 10;
+    }
+
+    #elif GAMMA1 == (1 << 19)
 
     for (i = 0; i < N / 2; ++i) {
         t[0] = GAMMA1 - a->coeffs[2 * i + 0];
@@ -830,6 +885,11 @@
         r[5 * i + 4]  = t[1] >> 12;
     }
 
+    #else
+
+#error "No parameter specified!"
+
+    #endif
 
     DBENCH_STOP(*tpack);
 }
@@ -847,6 +907,36 @@
     unsigned int i;
     DBENCH_START();
 
+    #if GAMMA1 == (1 << 17)
+
+    for (i = 0; i < N / 4; ++i) {
+        r->coeffs[4 * i + 0]  = a[9 * i + 0];
+        r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8;
+        r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 2] << 16;
+        r->coeffs[4 * i + 0] &= 0x3FFFF;
+
+        r->coeffs[4 * i + 1]  = a[9 * i + 2] >> 2;
+        r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6;
+        r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 4] << 14;
+        r->coeffs[4 * i + 1] &= 0x3FFFF;
+
+        r->coeffs[4 * i + 2]  = a[9 * i + 4] >> 4;
+        r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4;
+        r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12;
+        r->coeffs[4 * i + 2] &= 0x3FFFF;
+
+        r->coeffs[4 * i + 3]  = a[9 * i + 6] >> 6;
+        r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2;
+        r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10;
+        r->coeffs[4 * i + 3] &= 0x3FFFF;
+
+        r->coeffs[4 * i + 0] = GAMMA1 - r->coeffs[4 * i + 0];
+        r->coeffs[4 * i + 1] = GAMMA1 - r->coeffs[4 * i + 1];
+        r->coeffs[4 * i + 2] = GAMMA1 - r->coeffs[4 * i + 2];
+        r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3];
+    }
+
+    #elif GAMMA1 == (1 << 19)
 
     for (i = 0; i < N / 2; ++i) {
         r->coeffs[2 * i + 0]  = a[5 * i + 0];
@@ -863,6 +953,11 @@
         r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1];
     }
 
+    #else
+
+#error "No parameter specified!"
+
+    #endif
 
     DBENCH_STOP(*tpack);
 }
@@ -881,11 +976,28 @@
     unsigned int i;
     DBENCH_START();
 
+    #if GAMMA2 == (DILITHIUM_Q-1)/88
+
+    for (i = 0; i < N / 4; ++i) {
+        r[3 * i + 0]  = a->coeffs[4 * i + 0];
+        r[3 * i + 0] |= a->coeffs[4 * i + 1] << 6;
+        r[3 * i + 1]  = a->coeffs[4 * i + 1] >> 2;
+        r[3 * i + 1] |= a->coeffs[4 * i + 2] << 4;
+        r[3 * i + 2]  = a->coeffs[4 * i + 2] >> 4;
+        r[3 * i + 2] |= a->coeffs[4 * i + 3] << 2;
+    }
+
+    #elif GAMMA2 == (DILITHIUM_Q-1)/32
 
     for (i = 0; i < N / 2; ++i) {
         r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4);
     }
 
+    #else
+
+#error "No parameter specified!"
+
+    #endif
 
     DBENCH_STOP(*tpack);
 }
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/poly.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/poly.h
index bad4e78..9f00fa6 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/poly.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/poly.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef POLY_H
 #define POLY_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/polyvec.c b/src/sig/dilithium/pqclean_dilithium3_aarch64/polyvec.c
index 1c3f7cf..b73d210 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/polyvec.c
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/polyvec.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "params.h"
 #include "poly.h"
 #include "polyvec.h"
@@ -146,11 +178,11 @@
 *              - const polyvecl *u: pointer to first input vector
 *              - const polyvecl *v: pointer to second input vector
 **************************************************/
-extern void PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
 void polyvecl_pointwise_acc_montgomery(poly *w,
                                        const polyvecl *u,
                                        const polyvecl *v) {
-    PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
+    PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
 }
 
 /*************************************************
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/polyvec.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/polyvec.h
index 59d2d15..8fb7f73 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/polyvec.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/polyvec.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef POLYVEC_H
 #define POLYVEC_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/reduce.c b/src/sig/dilithium/pqclean_dilithium3_aarch64/reduce.c
index ab06800..4bf239a 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/reduce.c
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/reduce.c
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #include "params.h"
 #include "reduce.h"
 #include <stdint.h>
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/reduce.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/reduce.h
index c8bc606..8ca9a37 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/reduce.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/reduce.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef REDUCE_H
 #define REDUCE_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/rounding.c b/src/sig/dilithium/pqclean_dilithium3_aarch64/rounding.c
index 15c846c..91c04d1 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/rounding.c
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/rounding.c
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #include "params.h"
 #include "rounding.h"
 #include <stdint.h>
@@ -40,10 +47,21 @@
     int32_t a1;
 
     a1  = (a + 127) >> 7;
+    #if GAMMA2 == (DILITHIUM_Q-1)/32
 
     a1  = (a1 * 1025 + (1 << 21)) >> 22;
     a1 &= 15;
 
+    #elif GAMMA2 == (DILITHIUM_Q-1)/88
+
+    a1  = (a1 * 11275 + (1 << 23)) >> 24;
+    a1 ^= ((43 - a1) >> 31) & a1;
+
+    #else
+
+#error "No parameter specified"
+
+    #endif
 
     *a0  = a - a1 * 2 * GAMMA2;
     *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q;
@@ -87,10 +105,18 @@
         return a1;
     }
 
+    #if GAMMA2 == (DILITHIUM_Q-1)/32
 
     if (a0 > 0) {
         return (a1 + 1) & 15;
     }
     return (a1 - 1) & 15;
+    #elif GAMMA2 == (DILITHIUM_Q-1)/88
+
+    if (a0 > 0) {
+        return (a1 == 43) ?  0 : a1 + 1;
+    }
+    return (a1 ==  0) ? 43 : a1 - 1;
+    #endif
 
 }
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/rounding.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/rounding.h
index ec60cee..a888737 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/rounding.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/rounding.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef ROUNDING_H
 #define ROUNDING_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/sign.c b/src/sig/dilithium/pqclean_dilithium3_aarch64/sign.c
index e6c032d..a299d72 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/sign.c
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/sign.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "fips202.h"
 #include "packing.h"
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/sign.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/sign.h
index f577b11..fba1bf1 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/sign.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/sign.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef SIGN_H
 #define SIGN_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/symmetric-shake.c b/src/sig/dilithium/pqclean_dilithium3_aarch64/symmetric-shake.c
index 878d655..a53074a 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/symmetric-shake.c
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/symmetric-shake.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "fips202.h"
 #include "params.h"
 #include "symmetric.h"
diff --git a/src/sig/dilithium/pqclean_dilithium3_aarch64/symmetric.h b/src/sig/dilithium/pqclean_dilithium3_aarch64/symmetric.h
index af3be4f..3739282 100644
--- a/src/sig/dilithium/pqclean_dilithium3_aarch64/symmetric.h
+++ b/src/sig/dilithium/pqclean_dilithium3_aarch64/symmetric.h
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef SYMMETRIC_H
 #define SYMMETRIC_H
 #include "fips202.h"
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/Makefile b/src/sig/dilithium/pqclean_dilithium5_aarch64/Makefile
deleted file mode 100644
index c5eae5a..0000000
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-# This Makefile can be used with GNU Make or BSD Make
-
-LIB=libdilithium5_aarch64.a
-HEADERS=api.h fips202x2.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h
-OBJECTS=fips202x2.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o
-
-
-CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g
-
-all: $(LIB)
-
-%.o: %.c $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-%.o: %.S $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-$(LIB): $(OBJECTS) $(HEADERS)
-	$(AR) -r $@ $(OBJECTS)
-
-clean:
-	$(RM) $(OBJECTS)
-	$(RM) $(LIB)
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/NTT_params.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/NTT_params.h
index 097952b..661952e 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/NTT_params.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/NTT_params.h
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_PARAMS_H
 #define NTT_PARAMS_H
 
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_NTT.S b/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_NTT.S
index 07383d3..9cf6143 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_NTT.S
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_NTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top
+PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top:
 
     push_all
     Q         .req w20
@@ -183,13 +206,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot
+PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot:
 
     push_all
     Q         .req w20
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_iNTT.S b/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_iNTT.S
index c87085a..9daebaf 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_iNTT.S
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_iNTT.S
@@ -1,14 +1,37 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top
+PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top:
 
     push_all
     Q         .req w20
@@ -412,13 +435,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot
+PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot:
 
     push_all
     Q         .req w20
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_poly.S b/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_poly.S
index 8428ef8..edcc82b 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_poly.S
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/__asm_poly.S
@@ -1,15 +1,38 @@
 
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros.inc"
 #include "params.h"
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32
+PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32:
 
     mov x7, #16
     _10_to_32_loop:
@@ -79,13 +102,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce
+PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce:
 
     ldr w4, [x1]
 
@@ -175,13 +195,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq
+PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq:
 
     ldr w4, [x1]
 
@@ -271,13 +288,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze
+PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze:
 
     ldr w4, [x1]
 
@@ -403,13 +417,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round
+PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round:
 
     mov w4, #1
 
@@ -552,13 +563,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add
+PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add:
 
     ld1 {v0.4S}, [x1], #16
     ld1 {v4.4S}, [x2], #16
@@ -604,13 +612,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub
+PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub:
 
     ld1 {v0.4S}, [x1], #16
     ld1 {v4.4S}, [x2], #16
@@ -656,13 +661,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl
+PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl:
 
     add x1, x0, #0
 
@@ -726,13 +728,10 @@
     br lr
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery
+PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery:
 
     push_all
 
@@ -848,13 +847,10 @@
 
 
 .align 2
-.global PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery
-.global _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery
-#ifndef __clang__
-.type PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery, %function
-#endif
-PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery:
-_PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery:
+.global PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+.global _PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery
+PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
+_PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery:
 
     push_all
 
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/api.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/api.h
index 96ae7c6..ab5e2c4 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/api.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/api.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef PQCLEAN_DILITHIUM5_AARCH64_API_H
 #define PQCLEAN_DILITHIUM5_AARCH64_API_H
 
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/feat.S b/src/sig/dilithium/pqclean_dilithium5_aarch64/feat.S
new file mode 100644
index 0000000..01abc10
--- /dev/null
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/feat.S
@@ -0,0 +1,168 @@
+
+/*
+MIT License
+
+Copyright (c) 2020 Bas Westerbaan
+Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3)
+
+.macro round
+    // One round of Keccak-f[1600] on the 25 lanes in v0..v24; x1 points at the round constant.
+    // Written in standard A64 syntax (vN.16b operands, double-slash comments) for GNU as and LLVM.
+    // Theta, step 1: column parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i] go to v25..v29.
+    eor3 v25.16b, v0.16b, v5.16b, v10.16b
+    eor3 v26.16b, v1.16b, v6.16b, v11.16b
+    eor3 v27.16b, v2.16b, v7.16b, v12.16b
+    eor3 v28.16b, v3.16b, v8.16b, v13.16b
+    eor3 v29.16b, v4.16b, v9.16b, v14.16b
+    eor3 v25.16b, v25.16b, v15.16b, v20.16b
+    eor3 v26.16b, v26.16b, v16.16b, v21.16b
+    eor3 v27.16b, v27.16b, v17.16b, v22.16b
+    eor3 v28.16b, v28.16b, v18.16b, v23.16b
+    eor3 v29.16b, v29.16b, v19.16b, v24.16b
+
+    rax1 v30.2d, v29.2d, v26.2d // d[0] = rotl(p[1], 1) ^ p[4]
+    rax1 v29.2d, v27.2d, v29.2d // d[3] = rotl(p[4], 1) ^ p[2]
+    rax1 v27.2d, v25.2d, v27.2d // d[1] = rotl(p[2], 1) ^ p[0]
+    rax1 v25.2d, v28.2d, v25.2d // d[4] = rotl(p[0], 1) ^ p[3]
+    rax1 v28.2d, v26.2d, v28.2d // d[2] = rotl(p[3], 1) ^ p[1]
+
+    // Theta, step 2, fused with rho and pi: each xar xors the matching d[] value into a
+    // lane and rotates it into its new position. v31 keeps the old v1 for the final xar.
+    eor v0.16b, v0.16b, v30.16b
+    mov v31.16b, v1.16b
+    xar v1.2d,  v6.2d,  v27.2d, #20
+    xar v6.2d,  v9.2d,  v25.2d, #44
+    xar v9.2d,  v22.2d, v28.2d, #3
+    xar v22.2d, v14.2d, v25.2d, #25
+    xar v14.2d, v20.2d, v30.2d, #46
+    xar v20.2d, v2.2d,  v28.2d, #2
+    xar v2.2d,  v12.2d, v28.2d, #21
+    xar v12.2d, v13.2d, v29.2d, #39
+    xar v13.2d, v19.2d, v25.2d, #56
+    xar v19.2d, v23.2d, v29.2d, #8
+    xar v23.2d, v15.2d, v30.2d, #23
+    xar v15.2d, v4.2d,  v25.2d, #37
+    xar v4.2d,  v24.2d, v25.2d, #50
+    xar v24.2d, v21.2d, v27.2d, #62
+    xar v21.2d, v8.2d,  v29.2d, #9
+    xar v8.2d,  v16.2d, v27.2d, #19
+    xar v16.2d, v5.2d,  v30.2d, #28
+    xar v5.2d,  v3.2d,  v29.2d, #36
+    xar v3.2d,  v18.2d, v29.2d, #43
+    xar v18.2d, v17.2d, v28.2d, #49
+    xar v17.2d, v11.2d, v27.2d, #54
+    xar v11.2d, v7.2d,  v28.2d, #58
+    xar v7.2d,  v10.2d, v30.2d, #61
+    xar v10.2d, v31.2d, v27.2d, #63
+
+    // Chi: within each row of five lanes, a[i] ^= a[i+2] & ~a[i+1]; v25 and v26 are temporaries.
+    bcax v25.16b, v0.16b, v2.16b, v1.16b
+    bcax v26.16b, v1.16b, v3.16b, v2.16b
+    bcax v2.16b,  v2.16b, v4.16b, v3.16b
+    bcax v3.16b,  v3.16b, v0.16b, v4.16b
+    bcax v4.16b,  v4.16b, v1.16b, v0.16b
+    mov v0.16b, v25.16b
+    mov v1.16b, v26.16b
+
+    bcax v25.16b, v5.16b, v7.16b, v6.16b
+    bcax v26.16b, v6.16b, v8.16b, v7.16b
+    bcax v7.16b,  v7.16b, v9.16b, v8.16b
+    bcax v8.16b,  v8.16b, v5.16b, v9.16b
+    bcax v9.16b,  v9.16b, v6.16b, v5.16b
+    mov v5.16b, v25.16b
+    mov v6.16b, v26.16b
+
+    bcax v25.16b, v10.16b, v12.16b, v11.16b
+    bcax v26.16b, v11.16b, v13.16b, v12.16b
+    bcax v12.16b, v12.16b, v14.16b, v13.16b
+    bcax v13.16b, v13.16b, v10.16b, v14.16b
+    bcax v14.16b, v14.16b, v11.16b, v10.16b
+    mov v10.16b, v25.16b
+    mov v11.16b, v26.16b
+
+    bcax v25.16b, v15.16b, v17.16b, v16.16b
+    bcax v26.16b, v16.16b, v18.16b, v17.16b
+    bcax v17.16b, v17.16b, v19.16b, v18.16b
+    bcax v18.16b, v18.16b, v15.16b, v19.16b
+    bcax v19.16b, v19.16b, v16.16b, v15.16b
+    mov v15.16b, v25.16b
+    mov v16.16b, v26.16b
+
+    bcax v25.16b, v20.16b, v22.16b, v21.16b
+    bcax v26.16b, v21.16b, v23.16b, v22.16b
+    bcax v22.16b, v22.16b, v24.16b, v23.16b
+    bcax v23.16b, v23.16b, v20.16b, v24.16b
+    bcax v24.16b, v24.16b, v21.16b, v20.16b
+    mov v20.16b, v25.16b
+    mov v21.16b, v26.16b
+
+    // Iota: xor the next round constant, copied to both 64-bit lanes, into v0; x1 advances by 8.
+    ld1r {v25.2d}, [x1], #8
+    eor v0.16b, v0.16b, v25.16b
+.endm
+
+// f1600x2(x0 = state, x1 = round constants): runs 24 rounds over 25 128-bit vectors, in place.
+// Each vector carries two 64-bit lanes, so two Keccak states are permuted at once.
+.align 4
+.global PQCLEAN_DILITHIUM5_AARCH64_f1600x2
+.global _PQCLEAN_DILITHIUM5_AARCH64_f1600x2
+PQCLEAN_DILITHIUM5_AARCH64_f1600x2:
+_PQCLEAN_DILITHIUM5_AARCH64_f1600x2:
+    stp d8,  d9,  [sp,#-16]! // d8-d15 are callee-saved and v8-v15 are overwritten below
+    stp d10, d11, [sp,#-16]!
+    stp d12, d13, [sp,#-16]!
+    stp d14, d15, [sp,#-16]!
+
+    mov x2, x0  // remember the state base; the post-indexed loads advance x0
+    mov x3, #24 // number of rounds still to run
+
+    ld1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    ld1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    ld1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    ld1 {v24.2d}, [x0]
+
+1:  // numeric local label, so no generic loop symbol ends up in the object file
+    round
+    sub x3, x3, #1 // flags are not needed; cbnz tests the register directly
+    cbnz x3, 1b
+
+    mov x0, x2 // rewind to the state base before writing the result back
+    st1 {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x0], #64
+    st1 {v4.2d,  v5.2d,  v6.2d,  v7.2d},  [x0], #64
+    st1 {v8.2d,  v9.2d,  v10.2d, v11.2d}, [x0], #64
+    st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    st1 {v24.2d}, [x0]
+
+    ldp d14, d15, [sp], #16
+    ldp d12, d13, [sp], #16
+    ldp d10, d11, [sp], #16
+    ldp d8,  d9,  [sp], #16
+    ret
+
+#endif
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/fips202x2.c b/src/sig/dilithium/pqclean_dilithium5_aarch64/fips202x2.c
index 3924900..259b199 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/fips202x2.c
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/fips202x2.c
@@ -1,6 +1,40 @@
-#include "fips202x2.h"
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License for this file.
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include <arm_neon.h>
 #include <stddef.h>
+#include "fips202x2.h"
 
 
 #define NROUNDS 24
@@ -67,275 +101,282 @@
 *
 * Arguments:   - uint64_t *state: pointer to input/output Keccak state
 **************************************************/
+extern void PQCLEAN_DILITHIUM5_AARCH64_f1600x2(v128*, const uint64_t*); /* assembly routine in feat.S; only emitted there under the same feature guard as the call below */
 static inline
-void KeccakF1600_StatePermutex2(v128 state[25]) {
-    v128 Aba, Abe, Abi, Abo, Abu;
-    v128 Aga, Age, Agi, Ago, Agu;
-    v128 Aka, Ake, Aki, Ako, Aku;
-    v128 Ama, Ame, Ami, Amo, Amu;
-    v128 Asa, Ase, Asi, Aso, Asu;
-    v128 BCa, BCe, BCi, BCo, BCu; // tmp
-    v128 Da, De, Di, Do, Du;      // D
-    v128 Eba, Ebe, Ebi, Ebo, Ebu;
-    v128 Ega, Ege, Egi, Ego, Egu;
-    v128 Eka, Eke, Eki, Eko, Eku;
-    v128 Ema, Eme, Emi, Emo, Emu;
-    v128 Esa, Ese, Esi, Eso, Esu;
+void KeccakF1600_StatePermutex2(v128 state[25])
+{
+#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */
+  PQCLEAN_DILITHIUM5_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants);
+#else
+  v128 Aba, Abe, Abi, Abo, Abu;
+  v128 Aga, Age, Agi, Ago, Agu;
+  v128 Aka, Ake, Aki, Ako, Aku;
+  v128 Ama, Ame, Ami, Amo, Amu;
+  v128 Asa, Ase, Asi, Aso, Asu;
+  v128 BCa, BCe, BCi, BCo, BCu; // tmp
+  v128 Da, De, Di, Do, Du;      // D
+  v128 Eba, Ebe, Ebi, Ebo, Ebu;
+  v128 Ega, Ege, Egi, Ego, Egu;
+  v128 Eka, Eke, Eki, Eko, Eku;
+  v128 Ema, Eme, Emi, Emo, Emu;
+  v128 Esa, Ese, Esi, Eso, Esu;
 
-    //copyFromState(A, state)
-    Aba = state[0];
-    Abe = state[1];
-    Abi = state[2];
-    Abo = state[3];
-    Abu = state[4];
-    Aga = state[5];
-    Age = state[6];
-    Agi = state[7];
-    Ago = state[8];
-    Agu = state[9];
-    Aka = state[10];
-    Ake = state[11];
-    Aki = state[12];
-    Ako = state[13];
-    Aku = state[14];
-    Ama = state[15];
-    Ame = state[16];
-    Ami = state[17];
-    Amo = state[18];
-    Amu = state[19];
-    Asa = state[20];
-    Ase = state[21];
-    Asi = state[22];
-    Aso = state[23];
-    Asu = state[24];
+  //copyFromState(A, state)
+  Aba = state[0];
+  Abe = state[1];
+  Abi = state[2];
+  Abo = state[3];
+  Abu = state[4];
+  Aga = state[5];
+  Age = state[6];
+  Agi = state[7];
+  Ago = state[8];
+  Agu = state[9];
+  Aka = state[10];
+  Ake = state[11];
+  Aki = state[12];
+  Ako = state[13];
+  Aku = state[14];
+  Ama = state[15];
+  Ame = state[16];
+  Ami = state[17];
+  Amo = state[18];
+  Amu = state[19];
+  Asa = state[20];
+  Ase = state[21];
+  Asi = state[22];
+  Aso = state[23];
+  Asu = state[24];
 
-    for (int round = 0; round < NROUNDS; round += 2) {
-        //    prepareTheta
-        vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
-        vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
-        vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
-        vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
-        vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
+  for (int round = 0; round < NROUNDS; round += 2)
+  {
+    //    prepareTheta
+    vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
+    vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
+    vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
+    vXOR4(BCo, Abo, Ago, Ako, Amo, Aso);
+    vXOR4(BCu, Abu, Agu, Aku, Amu, Asu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Aba, Aba, Da);
-        vxor(Age, Age, De);
-        vROL(BCe, Age, 44);
-        vxor(Aki, Aki, Di);
-        vROL(BCi, Aki, 43);
-        vxor(Amo, Amo, Do);
-        vROL(BCo, Amo, 21);
-        vxor(Asu, Asu, Du);
-        vROL(BCu, Asu, 14);
-        vXNA(Eba, Aba, BCe, BCi);
-        vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
-        vXNA(Ebe, BCe, BCi, BCo);
-        vXNA(Ebi, BCi, BCo, BCu);
-        vXNA(Ebo, BCo, BCu, Aba);
-        vXNA(Ebu, BCu, Aba, BCe);
+    vxor(Aba, Aba, Da);
+    vxor(Age, Age, De);
+    vROL(BCe, Age, 44);
+    vxor(Aki, Aki, Di);
+    vROL(BCi, Aki, 43);
+    vxor(Amo, Amo, Do);
+    vROL(BCo, Amo, 21);
+    vxor(Asu, Asu, Du);
+    vROL(BCu, Asu, 14);
+    vXNA(Eba, Aba, BCe, BCi);
+    vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round]));
+    vXNA(Ebe, BCe, BCi, BCo);
+    vXNA(Ebi, BCi, BCo, BCu);
+    vXNA(Ebo, BCo, BCu, Aba);
+    vXNA(Ebu, BCu, Aba, BCe);
 
-        vxor(Abo, Abo, Do);
-        vROL(BCa, Abo, 28);
-        vxor(Agu, Agu, Du);
-        vROL(BCe, Agu, 20);
-        vxor(Aka, Aka, Da);
-        vROL(BCi, Aka, 3);
-        vxor(Ame, Ame, De);
-        vROL(BCo, Ame, 45);
-        vxor(Asi, Asi, Di);
-        vROL(BCu, Asi, 61);
-        vXNA(Ega, BCa, BCe, BCi);
-        vXNA(Ege, BCe, BCi, BCo);
-        vXNA(Egi, BCi, BCo, BCu);
-        vXNA(Ego, BCo, BCu, BCa);
-        vXNA(Egu, BCu, BCa, BCe);
+    vxor(Abo, Abo, Do);
+    vROL(BCa, Abo, 28);
+    vxor(Agu, Agu, Du);
+    vROL(BCe, Agu, 20);
+    vxor(Aka, Aka, Da);
+    vROL(BCi, Aka, 3);
+    vxor(Ame, Ame, De);
+    vROL(BCo, Ame, 45);
+    vxor(Asi, Asi, Di);
+    vROL(BCu, Asi, 61);
+    vXNA(Ega, BCa, BCe, BCi);
+    vXNA(Ege, BCe, BCi, BCo);
+    vXNA(Egi, BCi, BCo, BCu);
+    vXNA(Ego, BCo, BCu, BCa);
+    vXNA(Egu, BCu, BCa, BCe);
 
-        vxor(Abe, Abe, De);
-        vROL(BCa, Abe, 1);
-        vxor(Agi, Agi, Di);
-        vROL(BCe, Agi, 6);
-        vxor(Ako, Ako, Do);
-        vROL(BCi, Ako, 25);
-        vxor(Amu, Amu, Du);
-        vROL(BCo, Amu, 8);
-        vxor(Asa, Asa, Da);
-        vROL(BCu, Asa, 18);
-        vXNA(Eka, BCa, BCe, BCi);
-        vXNA(Eke, BCe, BCi, BCo);
-        vXNA(Eki, BCi, BCo, BCu);
-        vXNA(Eko, BCo, BCu, BCa);
-        vXNA(Eku, BCu, BCa, BCe);
+    vxor(Abe, Abe, De);
+    vROL(BCa, Abe, 1);
+    vxor(Agi, Agi, Di);
+    vROL(BCe, Agi, 6);
+    vxor(Ako, Ako, Do);
+    vROL(BCi, Ako, 25);
+    vxor(Amu, Amu, Du);
+    vROL(BCo, Amu, 8);
+    vxor(Asa, Asa, Da);
+    vROL(BCu, Asa, 18);
+    vXNA(Eka, BCa, BCe, BCi);
+    vXNA(Eke, BCe, BCi, BCo);
+    vXNA(Eki, BCi, BCo, BCu);
+    vXNA(Eko, BCo, BCu, BCa);
+    vXNA(Eku, BCu, BCa, BCe);
 
-        vxor(Abu, Abu, Du);
-        vROL(BCa, Abu, 27);
-        vxor(Aga, Aga, Da);
-        vROL(BCe, Aga, 36);
-        vxor(Ake, Ake, De);
-        vROL(BCi, Ake, 10);
-        vxor(Ami, Ami, Di);
-        vROL(BCo, Ami, 15);
-        vxor(Aso, Aso, Do);
-        vROL(BCu, Aso, 56);
-        vXNA(Ema, BCa, BCe, BCi);
-        vXNA(Eme, BCe, BCi, BCo);
-        vXNA(Emi, BCi, BCo, BCu);
-        vXNA(Emo, BCo, BCu, BCa);
-        vXNA(Emu, BCu, BCa, BCe);
+    vxor(Abu, Abu, Du);
+    vROL(BCa, Abu, 27);
+    vxor(Aga, Aga, Da);
+    vROL(BCe, Aga, 36);
+    vxor(Ake, Ake, De);
+    vROL(BCi, Ake, 10);
+    vxor(Ami, Ami, Di);
+    vROL(BCo, Ami, 15);
+    vxor(Aso, Aso, Do);
+    vROL(BCu, Aso, 56);
+    vXNA(Ema, BCa, BCe, BCi);
+    vXNA(Eme, BCe, BCi, BCo);
+    vXNA(Emi, BCi, BCo, BCu);
+    vXNA(Emo, BCo, BCu, BCa);
+    vXNA(Emu, BCu, BCa, BCe);
 
-        vxor(Abi, Abi, Di);
-        vROL(BCa, Abi, 62);
-        vxor(Ago, Ago, Do);
-        vROL(BCe, Ago, 55);
-        vxor(Aku, Aku, Du);
-        vROL(BCi, Aku, 39);
-        vxor(Ama, Ama, Da);
-        vROL(BCo, Ama, 41);
-        vxor(Ase, Ase, De);
-        vROL(BCu, Ase, 2);
-        vXNA(Esa, BCa, BCe, BCi);
-        vXNA(Ese, BCe, BCi, BCo);
-        vXNA(Esi, BCi, BCo, BCu);
-        vXNA(Eso, BCo, BCu, BCa);
-        vXNA(Esu, BCu, BCa, BCe);
+    vxor(Abi, Abi, Di);
+    vROL(BCa, Abi, 62);
+    vxor(Ago, Ago, Do);
+    vROL(BCe, Ago, 55);
+    vxor(Aku, Aku, Du);
+    vROL(BCi, Aku, 39);
+    vxor(Ama, Ama, Da);
+    vROL(BCo, Ama, 41);
+    vxor(Ase, Ase, De);
+    vROL(BCu, Ase, 2);
+    vXNA(Esa, BCa, BCe, BCi);
+    vXNA(Ese, BCe, BCi, BCo);
+    vXNA(Esi, BCi, BCo, BCu);
+    vXNA(Eso, BCo, BCu, BCa);
+    vXNA(Esu, BCu, BCa, BCe);
 
-        // Next Round
+    // Next round (round + 1): consumes the E lanes and writes the result back into the A lanes
 
-        //    prepareTheta
-        vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
-        vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
-        vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
-        vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
-        vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
+    //    prepareTheta
+    vXOR4(BCa, Eba, Ega, Eka, Ema, Esa);
+    vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese);
+    vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi);
+    vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso);
+    vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu);
 
-        //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
-        vROL(Da, BCe, 1);
-        vxor(Da, BCu, Da);
-        vROL(De, BCi, 1);
-        vxor(De, BCa, De);
-        vROL(Di, BCo, 1);
-        vxor(Di, BCe, Di);
-        vROL(Do, BCu, 1);
-        vxor(Do, BCi, Do);
-        vROL(Du, BCa, 1);
-        vxor(Du, BCo, Du);
+    //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
+    vROL(Da, BCe, 1);
+    vxor(Da, BCu, Da);
+    vROL(De, BCi, 1);
+    vxor(De, BCa, De);
+    vROL(Di, BCo, 1);
+    vxor(Di, BCe, Di);
+    vROL(Do, BCu, 1);
+    vxor(Do, BCi, Do);
+    vROL(Du, BCa, 1);
+    vxor(Du, BCo, Du);
 
-        vxor(Eba, Eba, Da);
-        vxor(Ege, Ege, De);
-        vROL(BCe, Ege, 44);
-        vxor(Eki, Eki, Di);
-        vROL(BCi, Eki, 43);
-        vxor(Emo, Emo, Do);
-        vROL(BCo, Emo, 21);
-        vxor(Esu, Esu, Du);
-        vROL(BCu, Esu, 14);
-        vXNA(Aba, Eba, BCe, BCi);
-        vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
-        vXNA(Abe, BCe, BCi, BCo);
-        vXNA(Abi, BCi, BCo, BCu);
-        vXNA(Abo, BCo, BCu, Eba);
-        vXNA(Abu, BCu, Eba, BCe);
+    vxor(Eba, Eba, Da);
+    vxor(Ege, Ege, De);
+    vROL(BCe, Ege, 44);
+    vxor(Eki, Eki, Di);
+    vROL(BCi, Eki, 43);
+    vxor(Emo, Emo, Do);
+    vROL(BCo, Emo, 21);
+    vxor(Esu, Esu, Du);
+    vROL(BCu, Esu, 14);
+    vXNA(Aba, Eba, BCe, BCi);
+    vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1]));
+    vXNA(Abe, BCe, BCi, BCo);
+    vXNA(Abi, BCi, BCo, BCu);
+    vXNA(Abo, BCo, BCu, Eba);
+    vXNA(Abu, BCu, Eba, BCe);
 
-        vxor(Ebo, Ebo, Do);
-        vROL(BCa, Ebo, 28);
-        vxor(Egu, Egu, Du);
-        vROL(BCe, Egu, 20);
-        vxor(Eka, Eka, Da);
-        vROL(BCi, Eka, 3);
-        vxor(Eme, Eme, De);
-        vROL(BCo, Eme, 45);
-        vxor(Esi, Esi, Di);
-        vROL(BCu, Esi, 61);
-        vXNA(Aga, BCa, BCe, BCi);
-        vXNA(Age, BCe, BCi, BCo);
-        vXNA(Agi, BCi, BCo, BCu);
-        vXNA(Ago, BCo, BCu, BCa);
-        vXNA(Agu, BCu, BCa, BCe);
+    vxor(Ebo, Ebo, Do);
+    vROL(BCa, Ebo, 28);
+    vxor(Egu, Egu, Du);
+    vROL(BCe, Egu, 20);
+    vxor(Eka, Eka, Da);
+    vROL(BCi, Eka, 3);
+    vxor(Eme, Eme, De);
+    vROL(BCo, Eme, 45);
+    vxor(Esi, Esi, Di);
+    vROL(BCu, Esi, 61);
+    vXNA(Aga, BCa, BCe, BCi);
+    vXNA(Age, BCe, BCi, BCo);
+    vXNA(Agi, BCi, BCo, BCu);
+    vXNA(Ago, BCo, BCu, BCa);
+    vXNA(Agu, BCu, BCa, BCe);
 
-        vxor(Ebe, Ebe, De);
-        vROL(BCa, Ebe, 1);
-        vxor(Egi, Egi, Di);
-        vROL(BCe, Egi, 6);
-        vxor(Eko, Eko, Do);
-        vROL(BCi, Eko, 25);
-        vxor(Emu, Emu, Du);
-        vROL(BCo, Emu, 8);
-        vxor(Esa, Esa, Da);
-        vROL(BCu, Esa, 18);
-        vXNA(Aka, BCa, BCe, BCi);
-        vXNA(Ake, BCe, BCi, BCo);
-        vXNA(Aki, BCi, BCo, BCu);
-        vXNA(Ako, BCo, BCu, BCa);
-        vXNA(Aku, BCu, BCa, BCe);
+    vxor(Ebe, Ebe, De);
+    vROL(BCa, Ebe, 1);
+    vxor(Egi, Egi, Di);
+    vROL(BCe, Egi, 6);
+    vxor(Eko, Eko, Do);
+    vROL(BCi, Eko, 25);
+    vxor(Emu, Emu, Du);
+    vROL(BCo, Emu, 8);
+    vxor(Esa, Esa, Da);
+    vROL(BCu, Esa, 18);
+    vXNA(Aka, BCa, BCe, BCi);
+    vXNA(Ake, BCe, BCi, BCo);
+    vXNA(Aki, BCi, BCo, BCu);
+    vXNA(Ako, BCo, BCu, BCa);
+    vXNA(Aku, BCu, BCa, BCe);
 
-        vxor(Ebu, Ebu, Du);
-        vROL(BCa, Ebu, 27);
-        vxor(Ega, Ega, Da);
-        vROL(BCe, Ega, 36);
-        vxor(Eke, Eke, De);
-        vROL(BCi, Eke, 10);
-        vxor(Emi, Emi, Di);
-        vROL(BCo, Emi, 15);
-        vxor(Eso, Eso, Do);
-        vROL(BCu, Eso, 56);
-        vXNA(Ama, BCa, BCe, BCi);
-        vXNA(Ame, BCe, BCi, BCo);
-        vXNA(Ami, BCi, BCo, BCu);
-        vXNA(Amo, BCo, BCu, BCa);
-        vXNA(Amu, BCu, BCa, BCe);
+    vxor(Ebu, Ebu, Du);
+    vROL(BCa, Ebu, 27);
+    vxor(Ega, Ega, Da);
+    vROL(BCe, Ega, 36);
+    vxor(Eke, Eke, De);
+    vROL(BCi, Eke, 10);
+    vxor(Emi, Emi, Di);
+    vROL(BCo, Emi, 15);
+    vxor(Eso, Eso, Do);
+    vROL(BCu, Eso, 56);
+    vXNA(Ama, BCa, BCe, BCi);
+    vXNA(Ame, BCe, BCi, BCo);
+    vXNA(Ami, BCi, BCo, BCu);
+    vXNA(Amo, BCo, BCu, BCa);
+    vXNA(Amu, BCu, BCa, BCe);
 
-        vxor(Ebi, Ebi, Di);
-        vROL(BCa, Ebi, 62);
-        vxor(Ego, Ego, Do);
-        vROL(BCe, Ego, 55);
-        vxor(Eku, Eku, Du);
-        vROL(BCi, Eku, 39);
-        vxor(Ema, Ema, Da);
-        vROL(BCo, Ema, 41);
-        vxor(Ese, Ese, De);
-        vROL(BCu, Ese, 2);
-        vXNA(Asa, BCa, BCe, BCi);
-        vXNA(Ase, BCe, BCi, BCo);
-        vXNA(Asi, BCi, BCo, BCu);
-        vXNA(Aso, BCo, BCu, BCa);
-        vXNA(Asu, BCu, BCa, BCe);
-    }
+    vxor(Ebi, Ebi, Di);
+    vROL(BCa, Ebi, 62);
+    vxor(Ego, Ego, Do);
+    vROL(BCe, Ego, 55);
+    vxor(Eku, Eku, Du);
+    vROL(BCi, Eku, 39);
+    vxor(Ema, Ema, Da);
+    vROL(BCo, Ema, 41);
+    vxor(Ese, Ese, De);
+    vROL(BCu, Ese, 2);
+    vXNA(Asa, BCa, BCe, BCi);
+    vXNA(Ase, BCe, BCi, BCo);
+    vXNA(Asi, BCi, BCo, BCu);
+    vXNA(Aso, BCo, BCu, BCa);
+    vXNA(Asu, BCu, BCa, BCe);
+  }
 
-    state[0] = Aba;
-    state[1] = Abe;
-    state[2] = Abi;
-    state[3] = Abo;
-    state[4] = Abu;
-    state[5] = Aga;
-    state[6] = Age;
-    state[7] = Agi;
-    state[8] = Ago;
-    state[9] = Agu;
-    state[10] = Aka;
-    state[11] = Ake;
-    state[12] = Aki;
-    state[13] = Ako;
-    state[14] = Aku;
-    state[15] = Ama;
-    state[16] = Ame;
-    state[17] = Ami;
-    state[18] = Amo;
-    state[19] = Amu;
-    state[20] = Asa;
-    state[21] = Ase;
-    state[22] = Asi;
-    state[23] = Aso;
-    state[24] = Asu;
+  state[0] = Aba;
+  state[1] = Abe;
+  state[2] = Abi;
+  state[3] = Abo;
+  state[4] = Abu;
+  state[5] = Aga;
+  state[6] = Age;
+  state[7] = Agi;
+  state[8] = Ago;
+  state[9] = Agu;
+  state[10] = Aka;
+  state[11] = Ake;
+  state[12] = Aki;
+  state[13] = Ako;
+  state[14] = Aku;
+  state[15] = Ama;
+  state[16] = Ame;
+  state[17] = Ami;
+  state[18] = Amo;
+  state[19] = Amu;
+  state[20] = Asa;
+  state[21] = Ase;
+  state[22] = Asi;
+  state[23] = Aso;
+  state[24] = Asu;
+#endif
 }
 
 /*************************************************
@@ -463,39 +504,41 @@
                             uint8_t *out1,
                             size_t nblocks,
                             unsigned int r,
-                            v128 s[25]) {
-    unsigned int i;
+                            v128 s[25]){
+  unsigned int i;
 
-    uint64x1_t a, b;
-    uint64x2x2_t a2, b2;
+  uint64x1_t a, b;
+  uint64x2x2_t a2, b2;
 
-    while (nblocks > 0) {
-        KeccakF1600_StatePermutex2(s);
+  while (nblocks > 0)
+  {
+    KeccakF1600_StatePermutex2(s);
 
-        for (i = 0; i < r / 8 - 1; i += 4) {
-            a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
-            b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
-            a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
-            b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
-            vst1q_u64_x2((uint64_t *)out0, a2);
-            vst1q_u64_x2((uint64_t *)out1, b2);
+    for (i = 0; i < r / 8 - 1; i += 4)
+    {
+      a2.val[0] = vuzp1q_u64(s[i], s[i + 1]);
+      b2.val[0] = vuzp2q_u64(s[i], s[i + 1]);
+      a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]);
+      b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]);
+      vst1q_u64_x2((uint64_t *)out0, a2);
+      vst1q_u64_x2((uint64_t *)out1, b2);
 
-            out0 += 32;
-            out1 += 32;
-        }
-
-        i = r / 8 - 1;
-        // Last iteration
-        a = vget_low_u64(s[i]);
-        b = vget_high_u64(s[i]);
-        vst1_u64((uint64_t *)out0, a);
-        vst1_u64((uint64_t *)out1, b);
-
-        out0 += 8;
-        out1 += 8;
-
-        --nblocks;
+      out0 += 32;
+      out1 += 32;
     }
+
+    i = r / 8 - 1;
+    // Last lane of the block (index r / 8 - 1): low 64 bits go to out0, high 64 bits to out1
+    a = vget_low_u64(s[i]);
+    b = vget_high_u64(s[i]);
+    vst1_u64((uint64_t *)out0, a);
+    vst1_u64((uint64_t *)out1, b);
+
+    out0 += 8;
+    out1 += 8;
+
+    --nblocks;
+  }
 }
 
 /*************************************************
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/fips202x2.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/fips202x2.h
index 63a2bba..5d43f8a 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/fips202x2.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/fips202x2.h
@@ -1,3 +1,11 @@
+
+/*
+ * This file is licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
 #ifndef FIPS202X2_H
 #define FIPS202X2_H
 
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/macros.inc b/src/sig/dilithium/pqclean_dilithium5_aarch64/macros.inc
index 66c1333..ef3af4c 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/macros.inc
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/macros.inc
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "macros_common.inc"
 
 .macro wrap_trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3, qS, dD
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/macros_common.inc b/src/sig/dilithium/pqclean_dilithium5_aarch64/macros_common.inc
index df151bb..bd7e77e 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/macros_common.inc
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/macros_common.inc
@@ -1,3 +1,30 @@
+
+/*
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 // for ABI
 
 .macro push_all
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/ntt.c b/src/sig/dilithium/pqclean_dilithium5_aarch64/ntt.c
index 27875b2..d8909dc 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/ntt.c
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/ntt.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "params.h"
 #include "reduce.h"
 #include <stdint.h>
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/ntt.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/ntt.h
index 6f912fc..e6f5118 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/ntt.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/ntt.h
@@ -1,29 +1,61 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef NTT_H
 #define NTT_H
 #include "NTT_params.h"
 #include "params.h"
 #include <stdint.h>
 
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants);
 
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top(int *des, const int *table, const int *_constants);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot(int *des, const int *table, const int *_constants);
 
 #define NTT(in) { \
-        PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
-        PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
     }
 
 #define iNTT(in) { \
-        PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
-        PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \
+        PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
     }
 
 #define ntt DILITHIUM_NAMESPACE(ntt)
-void ntt(int32_t a[N]);
+void ntt(int32_t a[ARRAY_N]);
 #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont)
-void invntt_tomont(int32_t a[N]);
+void invntt_tomont(int32_t a[ARRAY_N]);
 
 static const int constants[16] = {
     Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime,
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/packing.c b/src/sig/dilithium/pqclean_dilithium5_aarch64/packing.c
index a93b9d8..9ac5e36 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/packing.c
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/packing.c
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #include "packing.h"
 #include "params.h"
 #include "poly.h"
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/packing.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/packing.h
index 5f49829..03f8933 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/packing.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/packing.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef PACKING_H
 #define PACKING_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/params.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/params.h
index b5d7521..a967fd4 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/params.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/params.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef PARAMS_H
 #define PARAMS_H
 
@@ -35,9 +42,17 @@
 #define POLYVECH_PACKEDBYTES (OMEGA + K)
 
 
+#if GAMMA1 == (1 << 17)
+#define POLYZ_PACKEDBYTES   576
+#elif GAMMA1 == (1 << 19)
 #define POLYZ_PACKEDBYTES   640
+#endif
 
+#if GAMMA2 == (DILITHIUM_Q-1)/88
+#define POLYW1_PACKEDBYTES  192
+#elif GAMMA2 == (DILITHIUM_Q-1)/32
 #define POLYW1_PACKEDBYTES  128
+#endif
 
 #define POLYETA_PACKEDBYTES  96
 
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/poly.c b/src/sig/dilithium/pqclean_dilithium5_aarch64/poly.c
index 02f95e3..788bb14 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/poly.c
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/poly.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "params.h"
 #include "poly.h"
 #include "reduce.h"
@@ -25,11 +57,11 @@
 *
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce(int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce(int32_t *, const int32_t *);
 void poly_reduce(poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce(a->coeffs, montgomery_const);
 
     DBENCH_STOP(*tred);
 }
@@ -42,11 +74,11 @@
 *
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq(int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq(int32_t *, const int32_t *);
 void poly_caddq(poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq(a->coeffs, montgomery_const);
 
     DBENCH_STOP(*tred);
 }
@@ -59,11 +91,11 @@
 *
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze(int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze(int32_t *, const int32_t *);
 void poly_freeze(poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze(a->coeffs, montgomery_const);
 
     DBENCH_STOP(*tred);
 }
@@ -173,11 +205,11 @@
 *              - const poly *a: pointer to first input polynomial
 *              - const poly *b: pointer to second input polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table);
 void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
+    PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const);
 
     DBENCH_STOP(*tmul);
 }
@@ -194,11 +226,11 @@
 *              - poly *a0: pointer to output polynomial with coefficients c0
 *              - const poly *a: pointer to input polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *);
 void poly_power2round(poly *a1, poly *a0, const poly *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
+    PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs);
 
     DBENCH_STOP(*tround);
 }
@@ -706,11 +738,11 @@
 * Arguments:   - poly *r: pointer to output polynomial
 *              - const uint8_t *a: byte array with bit-packed polynomial
 **************************************************/
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32(int32_t *, const uint8_t *);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32(int32_t *, const uint8_t *);
 void polyt1_unpack(poly *r, const uint8_t *a) {
     DBENCH_START();
 
-    PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32(r->coeffs, a);
+    PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32(r->coeffs, a);
 
     DBENCH_STOP(*tpack);
 }
@@ -841,6 +873,29 @@
     uint32_t t[4];
     DBENCH_START();
 
+    #if GAMMA1 == (1 << 17)
+
+    for (i = 0; i < N / 4; ++i) {
+        t[0] = GAMMA1 - a->coeffs[4 * i + 0];
+        t[1] = GAMMA1 - a->coeffs[4 * i + 1];
+        t[2] = GAMMA1 - a->coeffs[4 * i + 2];
+        t[3] = GAMMA1 - a->coeffs[4 * i + 3];
+
+        r[9 * i + 0]  = t[0];
+        r[9 * i + 1]  = t[0] >> 8;
+        r[9 * i + 2]  = t[0] >> 16;
+        r[9 * i + 2] |= t[1] << 2;
+        r[9 * i + 3]  = t[1] >> 6;
+        r[9 * i + 4]  = t[1] >> 14;
+        r[9 * i + 4] |= t[2] << 4;
+        r[9 * i + 5]  = t[2] >> 4;
+        r[9 * i + 6]  = t[2] >> 12;
+        r[9 * i + 6] |= t[3] << 6;
+        r[9 * i + 7]  = t[3] >> 2;
+        r[9 * i + 8]  = t[3] >> 10;
+    }
+
+    #elif GAMMA1 == (1 << 19)
 
     for (i = 0; i < N / 2; ++i) {
         t[0] = GAMMA1 - a->coeffs[2 * i + 0];
@@ -854,6 +909,11 @@
         r[5 * i + 4]  = t[1] >> 12;
     }
 
+    #else
+
+#error "No parameter specified!"
+
+    #endif
 
     DBENCH_STOP(*tpack);
 }
@@ -871,6 +931,36 @@
     unsigned int i;
     DBENCH_START();
 
+    #if GAMMA1 == (1 << 17)
+
+    for (i = 0; i < N / 4; ++i) {
+        r->coeffs[4 * i + 0]  = a[9 * i + 0];
+        r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8;
+        r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 2] << 16;
+        r->coeffs[4 * i + 0] &= 0x3FFFF;
+
+        r->coeffs[4 * i + 1]  = a[9 * i + 2] >> 2;
+        r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6;
+        r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 4] << 14;
+        r->coeffs[4 * i + 1] &= 0x3FFFF;
+
+        r->coeffs[4 * i + 2]  = a[9 * i + 4] >> 4;
+        r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4;
+        r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12;
+        r->coeffs[4 * i + 2] &= 0x3FFFF;
+
+        r->coeffs[4 * i + 3]  = a[9 * i + 6] >> 6;
+        r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2;
+        r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10;
+        r->coeffs[4 * i + 3] &= 0x3FFFF;
+
+        r->coeffs[4 * i + 0] = GAMMA1 - r->coeffs[4 * i + 0];
+        r->coeffs[4 * i + 1] = GAMMA1 - r->coeffs[4 * i + 1];
+        r->coeffs[4 * i + 2] = GAMMA1 - r->coeffs[4 * i + 2];
+        r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3];
+    }
+
+    #elif GAMMA1 == (1 << 19)
 
     for (i = 0; i < N / 2; ++i) {
         r->coeffs[2 * i + 0]  = a[5 * i + 0];
@@ -887,6 +977,11 @@
         r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1];
     }
 
+    #else
+
+#error "No parameter specified!"
+
+    #endif
 
     DBENCH_STOP(*tpack);
 }
@@ -905,11 +1000,28 @@
     unsigned int i;
     DBENCH_START();
 
+    #if GAMMA2 == (DILITHIUM_Q-1)/88
+
+    for (i = 0; i < N / 4; ++i) {
+        r[3 * i + 0]  = a->coeffs[4 * i + 0];
+        r[3 * i + 0] |= a->coeffs[4 * i + 1] << 6;
+        r[3 * i + 1]  = a->coeffs[4 * i + 1] >> 2;
+        r[3 * i + 1] |= a->coeffs[4 * i + 2] << 4;
+        r[3 * i + 2]  = a->coeffs[4 * i + 2] >> 4;
+        r[3 * i + 2] |= a->coeffs[4 * i + 3] << 2;
+    }
+
+    #elif GAMMA2 == (DILITHIUM_Q-1)/32
 
     for (i = 0; i < N / 2; ++i) {
         r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4);
     }
 
+    #else
+
+#error "No parameter specified!"
+
+    #endif
 
     DBENCH_STOP(*tpack);
 }
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/poly.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/poly.h
index bad4e78..9f00fa6 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/poly.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/poly.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef POLY_H
 #define POLY_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/polyvec.c b/src/sig/dilithium/pqclean_dilithium5_aarch64/polyvec.c
index ffe6ee7..4496aaf 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/polyvec.c
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/polyvec.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "params.h"
 #include "poly.h"
 #include "polyvec.h"
@@ -146,11 +178,11 @@
 *              - const polyvecl *u: pointer to first input vector
 *              - const polyvecl *v: pointer to second input vector
 **************************************************/
-extern void PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
+extern void PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *);
 void polyvecl_pointwise_acc_montgomery(poly *w,
                                        const polyvecl *u,
                                        const polyvecl *v) {
-    PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
+    PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const);
 }
 
 /*************************************************
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/polyvec.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/polyvec.h
index 59d2d15..8fb7f73 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/polyvec.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/polyvec.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef POLYVEC_H
 #define POLYVEC_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/reduce.c b/src/sig/dilithium/pqclean_dilithium5_aarch64/reduce.c
index ab06800..4bf239a 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/reduce.c
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/reduce.c
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #include "params.h"
 #include "reduce.h"
 #include <stdint.h>
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/reduce.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/reduce.h
index c8bc606..8ca9a37 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/reduce.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/reduce.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef REDUCE_H
 #define REDUCE_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/rounding.c b/src/sig/dilithium/pqclean_dilithium5_aarch64/rounding.c
index 15c846c..91c04d1 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/rounding.c
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/rounding.c
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #include "params.h"
 #include "rounding.h"
 #include <stdint.h>
@@ -40,10 +47,21 @@
     int32_t a1;
 
     a1  = (a + 127) >> 7;
+    #if GAMMA2 == (DILITHIUM_Q-1)/32
 
     a1  = (a1 * 1025 + (1 << 21)) >> 22;
     a1 &= 15;
 
+    #elif GAMMA2 == (DILITHIUM_Q-1)/88
+
+    a1  = (a1 * 11275 + (1 << 23)) >> 24;
+    a1 ^= ((43 - a1) >> 31) & a1;
+
+    #else
+
+#error "No parameter specified"
+
+    #endif
 
     *a0  = a - a1 * 2 * GAMMA2;
     *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q;
@@ -87,10 +105,18 @@
         return a1;
     }
 
+    #if GAMMA2 == (DILITHIUM_Q-1)/32
 
     if (a0 > 0) {
         return (a1 + 1) & 15;
     }
     return (a1 - 1) & 15;
+    #elif GAMMA2 == (DILITHIUM_Q-1)/88
+
+    if (a0 > 0) {
+        return (a1 == 43) ?  0 : a1 + 1;
+    }
+    return (a1 ==  0) ? 43 : a1 - 1;
+    #endif
 
 }
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/rounding.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/rounding.h
index ec60cee..a888737 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/rounding.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/rounding.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef ROUNDING_H
 #define ROUNDING_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/sign.c b/src/sig/dilithium/pqclean_dilithium5_aarch64/sign.c
index e6c032d..a299d72 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/sign.c
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/sign.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "fips202.h"
 #include "packing.h"
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/sign.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/sign.h
index f577b11..fba1bf1 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/sign.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/sign.h
@@ -1,3 +1,10 @@
+
+/*
+ * This file is dual licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html)
+ * or public domain at https://github.com/pq-crystals/dilithium
+ */
+
 #ifndef SIGN_H
 #define SIGN_H
 #include "params.h"
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/symmetric-shake.c b/src/sig/dilithium/pqclean_dilithium5_aarch64/symmetric-shake.c
index 878d655..a53074a 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/symmetric-shake.c
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/symmetric-shake.c
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "fips202.h"
 #include "params.h"
 #include "symmetric.h"
diff --git a/src/sig/dilithium/pqclean_dilithium5_aarch64/symmetric.h b/src/sig/dilithium/pqclean_dilithium5_aarch64/symmetric.h
index af3be4f..3739282 100644
--- a/src/sig/dilithium/pqclean_dilithium5_aarch64/symmetric.h
+++ b/src/sig/dilithium/pqclean_dilithium5_aarch64/symmetric.h
@@ -1,3 +1,35 @@
+
+/*
+ * This file was originally licensed
+ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or
+ * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref
+ *
+ * We choose
+ * CC0 1.0 Universal or the following MIT License
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #ifndef SYMMETRIC_H
 #define SYMMETRIC_H
 #include "fips202.h"
diff --git a/src/sig/falcon/pqclean_falcon-1024_avx2/sign.c b/src/sig/falcon/pqclean_falcon-1024_avx2/sign.c
index e596eb7..c9c42b9 100644
--- a/src/sig/falcon/pqclean_falcon-1024_avx2/sign.c
+++ b/src/sig/falcon/pqclean_falcon-1024_avx2/sign.c
@@ -1030,7 +1030,20 @@
      * On 32-bit systems, 'lo' really is two registers, requiring
      * some extra code.
      */
+#if defined(__x86_64__) || defined(_M_X64)
     xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo));
+#else
+    {
+        uint32_t e0, e1;
+        int32_t f0, f1;
+
+        e0 = (uint32_t)lo;
+        e1 = (uint32_t)(lo >> 32);
+        f0 = *(int32_t *)&e0;
+        f1 = *(int32_t *)&e1;
+        xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0);
+    }
+#endif
     gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo);
     gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo);
     gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo);
diff --git a/src/sig/falcon/pqclean_falcon-512_avx2/sign.c b/src/sig/falcon/pqclean_falcon-512_avx2/sign.c
index c8a9f9d..77ee5d3 100644
--- a/src/sig/falcon/pqclean_falcon-512_avx2/sign.c
+++ b/src/sig/falcon/pqclean_falcon-512_avx2/sign.c
@@ -1030,7 +1030,20 @@
      * On 32-bit systems, 'lo' really is two registers, requiring
      * some extra code.
      */
+#if defined(__x86_64__) || defined(_M_X64)
     xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo));
+#else
+    {
+        uint32_t e0, e1;
+        int32_t f0, f1;
+
+        e0 = (uint32_t)lo;
+        e1 = (uint32_t)(lo >> 32);
+        f0 = *(int32_t *)&e0;
+        f1 = *(int32_t *)&e1;
+        xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0);
+    }
+#endif
     gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo);
     gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo);
     gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo);
diff --git a/tests/constant_time/sig/passes/dilithium-aarch64 b/tests/constant_time/sig/passes/dilithium-aarch64
index a02dfa2..da337ea 100644
--- a/tests/constant_time/sig/passes/dilithium-aarch64
+++ b/tests/constant_time/sig/passes/dilithium-aarch64
@@ -29,20 +29,27 @@
    Rejection sampling for signature distribution
    Memcheck:Cond
    ...
-   src:sign.c:153 # Call to polyvecl_chknorm
+   src:sign.c:185 # Call to polyvecl_chknorm
    # fun:PQCLEAN_DILITHIUM*_AARCH64_crypto_sign_signature
 }
 {
    Rejection sampling for signature distribution
    Memcheck:Cond
    ...
-   src:sign.c:163 # Call to polyveck_chknorm
+   src:sign.c:195 # Call to polyveck_chknorm
+   # fun:PQCLEAN_DILITHIUM*_AARCH64_crypto_sign_signature
+}
+{
+   Rejection sampling for signature distribution
+   Memcheck:Cond
+   ...
+   src:sign.c:203 # Call to polyveck_chknorm
    # fun:PQCLEAN_DILITHIUM*_AARCH64_crypto_sign_signature
 }
 {
    Hint does not need to be computed in constant time
    Memcheck:Cond
    ...
-   src:sign.c:176 # Call to polyveck_make_hint
+   src:sign.c:208 # Call to polyveck_make_hint
    # fun:PQCLEAN_DILITHIUM*_AARCH64_crypto_sign_signature
 }