Add a list of all the FP+SIMD insns, as a base from which to
generate test cases.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13918 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/none/tests/arm64/test_arm64_fp_and_simd.c b/none/tests/arm64/test_arm64_fp_and_simd.c
index 11a7732..a725518 100644
--- a/none/tests/arm64/test_arm64_fp_and_simd.c
+++ b/none/tests/arm64/test_arm64_fp_and_simd.c
@@ -1765,7 +1765,7 @@
    test_neg_4h_4h();
    test_neg_16b_16b();
    test_neg_8b_8b();
-   printf("END:   NEG (vector) (MISSING 8b/16b)\n\n");
+   printf("END:   NEG (vector)\n\n");
 
    printf("BEGIN: TBL, TBX\n");
    test_tbl_16b_1reg();
@@ -1788,3 +1788,716 @@
 
    return 0;
 }
+
+/*
+   abs      d
+   abs      2d,4s,2s,8h,4h,16b,8b
+   add      d
+   add      2d,4s,2s,8h,4h,16b,8b
+   addhn{2} 2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h (add and get high half)
+   addp     d (add the pair of lanes in a 2d vector)
+   addp     2d,4s,2s,8h,4h,16b,8b
+   addv     4s,8h,4h,16b,8b (reduce across vector)
+   aesd     16b (aes single round decryption)
+   aese     16b (aes single round encryption)
+   aesimc   16b (aes inverse mix columns)
+   aesmc    16b (aes mix columns)
+   and      16b,8b
+
+   bic      4s,2s,8h,4h (vector, imm)
+            (same imm-encoding group as movi, mvni, orr)
+
+   bic      16b,8b (vector,reg) (bit clear)
+   bif      16b,8b (vector) (bit insert if false)
+   bit      16b,8b (vector) (bit insert if true)
+   bsl      16b,8b (vector) (bit select)
+
+   cls      4s,2s,8h,4h,16b,8b (count leading sign bits)
+   clz      4s,2s,8h,4h,16b,8b (count leading zero bits)
+
+   cmeq     d
+   cmeq     2d,4s,2s,8h,4h,16b,8b
+   cmeq_z   d
+   cmeq_z   2d,4s,2s,8h,4h,16b,8b
+
+   cmge     d
+   cmge     2d,4s,2s,8h,4h,16b,8b
+   cmge_z   d
+   cmge_z   2d,4s,2s,8h,4h,16b,8b
+
+   cmgt     d
+   cmgt     2d,4s,2s,8h,4h,16b,8b
+   cmgt_z   d
+   cmgt_z   2d,4s,2s,8h,4h,16b,8b
+
+   cmhi     d
+   cmhi     2d,4s,2s,8h,4h,16b,8b
+
+   cmhs     d
+   cmhs     2d,4s,2s,8h,4h,16b,8b
+
+   cmle_z   d
+   cmle_z   2d,4s,2s,8h,4h,16b,8b
+
+   cmlt_z   d
+   cmlt_z   2d,4s,2s,8h,4h,16b,8b
+
+   cmtst    d
+   cmtst    2d,4s,2s,8h,4h,16b,8b
+
+   cnt      16b,8b (population count per byte)
+
+   dup      d,s,h,b (vec elem to scalar)
+   dup      2d,4s,2s,8h,4h,16b,8b (vec elem to vector)
+   dup      2d,4s,2s,8h,4h,16b,8b (general reg to vector)
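+            (e.g. dup d0, v1.d[1];  dup v0.4s, v1.s[2];  dup v0.4s, w3)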
+
+   eor      16b,8b (vector)
+   ext      16b,8b,#imm4 (concat 2 vectors, then slice)
+
+   fabd     d,s
+   fabd     2d,4s,2s
+
+   fabs     d,s
+   fabs     2d,4s,2s
+
+   facge    s,d  (floating abs compare GE)
+   facge    2d,4s,2s
+
+   facgt    s,d  (floating abs compare GT)
+   facgt    2d,4s,2s
+
+   fadd     d,s
+   fadd     2d,4s,2s
+
+   faddp    d_2d,s_2s (floating add pair)
+   faddp    2d,4s,2s
+
+   fccmp    d,s (floating point conditional quiet compare)
+   fccmpe   d,s (floating point conditional signaling compare)
+
+   fcmeq    d,s
+   fcmeq    2d,4s,2s
+   fcmeq_z  d,s
+   fcmeq_z  2d,4s,2s
+
+   fcmge    d,s
+   fcmge    2d,4s,2s
+   fcmge_z  d,s
+   fcmge_z  2d,4s,2s
+
+   fcmgt    d,s
+   fcmgt    2d,4s,2s
+   fcmgt_z  d,s
+   fcmgt_z  2d,4s,2s
+
+   fcmle_z  d,s
+   fcmle_z  2d,4s,2s
+
+   fcmlt_z  d,s
+   fcmlt_z  2d,4s,2s
+
+   fcmp     d,s (floating point quiet compare, set flags)
+   fcmp_z   d,s
+   fcmpe    d,s (floating point signaling compare, set flags)
+   fcmpe_z  d,s
+
+   fcsel    d,s (fp cond select)
+
+   fcvt     s_h,d_h,h_s,d_s,h_d,s_d (fp convert, scalar)
+
+   fcvtas   d,s  (fcvt to signed int, nearest, ties away)
+   fcvtas   2d,4s,2s
+   fcvtas   w_s,x_s,w_d,x_d
+
+   fcvtau   d,s  (fcvt to unsigned int, nearest, ties away)
+   fcvtau   2d,4s,2s
+   fcvtau   w_s,x_s,w_d,x_d
+
+   fcvtl{2} 4s_4h, 4s_8h, 2d_2s, 2d_4s (float convert to longer form)
+
+   fcvtms   d,s  (fcvt to signed int, minus inf)
+   fcvtms   2d,4s,2s
+   fcvtms   w_s,x_s,w_d,x_d
+
+   fcvtmu   d,s  (fcvt to unsigned int, minus inf)
+   fcvtmu   2d,4s,2s
+   fcvtmu   w_s,x_s,w_d,x_d
+
+   fcvtn{2} 4h_4s, 8h_4s, 2s_2d, 4s_2d (float convert to narrower form)
+
+   fcvtns   d,s  (fcvt to signed int, nearest)
+   fcvtns   2d,4s,2s
+   fcvtns   w_s,x_s,w_d,x_d
+
+   fcvtnu   d,s  (fcvt to unsigned int, nearest)
+   fcvtnu   2d,4s,2s
+   fcvtnu   w_s,x_s,w_d,x_d
+
+   fcvtps   d,s  (fcvt to signed int, plus inf)
+   fcvtps   2d,4s,2s
+   fcvtps   w_s,x_s,w_d,x_d
+
+   fcvtpu   d,s  (fcvt to unsigned int, plus inf)
+   fcvtpu   2d,4s,2s
+   fcvtpu   w_s,x_s,w_d,x_d
+
+   fcvtxn   s_d (fcvt to lower prec narrow, rounding to odd)
+   fcvtxn   2s_2d,4s_2d
+
+   fcvtzs   s,d (fcvt to signed fixedpt, to zero) (w/ #fbits)
+   fcvtzs   2d,4s,2s
+
+   fcvtzs   s,d (fcvt to signed integer, to zero)
+   fcvtzs   2d,4s,2s
+
+   fcvtzs   w_s,x_s,w_d,x_d (fcvt to signed fixedpt, to zero) (w/ #fbits)
+
+   fcvtzs   w_s,x_s,w_d,x_d (fcvt to signed integer, to zero)
+
+   fcvtzu   s,d (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
+   fcvtzu   2d,4s,2s
+
+   fcvtzu   s,d (fcvt to unsigned integer, to zero)
+   fcvtzu   2d,4s,2s
+
+   fcvtzu   w_s,x_s,w_d,x_d (fcvt to unsigned fixedpt, to zero) (w/ #fbits)
+
+   fcvtzu   w_s,x_s,w_d,x_d (fcvt to unsigned integer, to zero)
+
+   fdiv     d,s
+   fdiv     2d,4s,2s
+
+   fmadd    d,s
+   fnmadd   d,s
+   fnmsub   d,s
+   fnmul    d,s
+
+   fmax     d,s
+   fmin     d,s
+
+   fmax     2d,4s,2s
+   fmin     2d,4s,2s
+
+   fmaxnm   d,s ("max number")
+   fminnm   d,s
+
+   fmaxnm   2d,4s,2s
+   fminnm   2d,4s,2s
+
+   fmaxnmp  d_2d,s_2s ("max number pairwise")
+   fminnmp  d_2d,s_2s
+
+   fmaxnmp  2d,4s,2s
+   fminnmp  2d,4s,2s
+
+   fmaxnmv  s_4s (maxnum across vector)
+   fminnmv  s_4s
+
+   fmaxp    d_2d,s_2s (max of a pair)
+   fminp    d_2d,s_2s (min of a pair)
+
+   fmaxp    2d,4s,2s  (max pairwise)
+   fminp    2d,4s,2s
+
+   fmaxv    s_4s (max across vector)
+   fminv    s_4s
+
+   fmla     d_d_d[],s_s_s[] (by element)
+   fmla     2d_2d_d[],4s_4s_s[],2s_2s_s[]
+
+   fmla     2d,4s,2s
+
+   fmls     d_d_d[],s_s_s[] (by element)
+   fmls     2d_2d_d[],4s_4s_s[],2s_2s_s[]
+
+   fmls     2d,4s,2s
+
+   fmov     2d,4s,2s #imm (part of the MOVI/MVNI/ORR/BIC imm group)
+
+   fmov     d_d,s_s
+
+   fmov     s_w,w_s,d_x,d[1]_x,x_d,x_d[1]
+
+   fmov     d,s #imm
+
+   fmsub    d,s
+
+   fmul     d_d_d[],s_s_s[]
+   fmul     2d_2d_d[],4s_4s_s[],2s_2s_s[]
+
+   fmul     2d,4s,2s
+   fmul     d,s
+
+   fmulx    d_d_d[],s_s_s[]
+   fmulx    2d_2d_d[],4s_4s_s[],2s_2s_s[]
+
+   fmulx    d,s
+   fmulx    2d,4s,2s
+
+   fneg     d,s
+   fneg     2d,4s,2s
+
+   frecpe   d,s (recip estimate)
+   frecpe   2d,4s,2s
+
+   frecps   d,s (recip step)
+   frecps   2d,4s,2s
+
+   frecpx   d,s (recip exponent)
+
+   frinta   2d,4s,2s (round to integral, nearest away)
+   frinta   d,s
+
+   frinti   2d,4s,2s (round to integral, per FPCR)
+   frinti   d,s
+
+   frintm   2d,4s,2s (round to integral, minus inf)
+   frintm   d,s
+
+   frintn   2d,4s,2s (round to integral, nearest, to even)
+   frintn   d,s
+
+   frintp   2d,4s,2s (round to integral, plus inf)
+   frintp   d,s
+
+   frintx   2d,4s,2s (round to integral exact, per FPCR)
+   frintx   d,s
+
+   frintz   2d,4s,2s (round to integral, zero)
+   frintz   d,s
+
+   frsqrte  d,s (est)
+   frsqrte  2d,4s,2s
+
+   frsqrts  d,s (step)
+   frsqrts  2d,4s,2s
+
+   fsqrt    d,s
+   fsqrt    2d,4s,2s
+
+   fsub     d,s
+   fsub     2d,4s,2s
+
+   ins      d[]_d[],s[]_s[],h[]_h[],b[]_b[]
+
+   ins      d[]_x, s[]_w, h[]_w, b[]_w
+
+   ld1  (multiple 1-element structures to 1/2/3/4 regs)
+   ld1  (single 1-element structure to one lane of 1 reg)
+   ld1r (single 1-element structure and rep to all lanes of 1 reg)
+
+   ld2  (multiple 2-element structures to 2 regs)
+   ld2  (single 2-element structure to one lane of 2 regs)
+   ld2r (single 2-element structure and rep to all lanes of 2 regs)
+
+   ld3  (multiple 3-element structures to 3 regs)
+   ld3  (single 3-element structure to one lane of 3 regs)
+   ld3r (single 3-element structure and rep to all lanes of 3 regs)
+
+   ld4  (multiple 4-element structures to 4 regs)
+   ld4  (single 4-element structure to one lane of 4 regs)
+   ld4r (single 4-element structure and rep to all lanes of 4 regs)
+
+   ldnp  q_q_addr,d_d_addr,s_s_addr  (load pair w/ non-temporal hint)
+         addr = reg + simm7 * reg_size
+
+   ldp   q_q_addr,d_d_addr,s_s_addr  (load pair)
+         addr = [Xn|SP],#imm   or [Xn|SP,#imm]!  or [Xn|SP,#imm]
+
+   ldr   q,d,s,h,b from addr
+         addr = [Xn|SP],#imm   or [Xn|SP,#imm]!  or [Xn|SP,#imm]
+
+   ldr   q,d,s from  pc+#imm19
+
+   ldr   q,d,s,h,b from addr
+         addr = [Xn|SP, R <extend> <shift>]
+
+   ldur  q,d,s,h,b from addr
+         addr = [Xn|SP,#imm] (unscaled offset)
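+
+         (concretely, the three ldp addressing forms above are, e.g.:
+            ldp q0, q1, [x0], #32     post-index
+            ldp q0, q1, [x0, #32]!    pre-index
+            ldp q0, q1, [x0, #32]     signed offset)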
+
+   mla   4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
+   mla   4s,2s,8h,4h,16b,8b
+
+   mls   4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
+   mls   4s,2s,8h,4h,16b,8b
+
+   movi  16b,8b   #imm8, LSL #0
+   movi  8h,4h    #imm8, LSL #0 or 8
+   movi  4s,2s    #imm8, LSL #0, 8, 16, 24
+   movi  4s,2s    #imm8, MSL #8 or 16
+   movi  d,       #imm64
+   movi  2d,      #imm64
+
+   mul   4s_4s_s[],2s_2s_s[],8h_8h_h[],4h_4h_h[]
+   mul   4s,2s,8h,4h,16b,8b
+
+   mvni  8h,4h    #imm8, LSL #0 or 8
+   mvni  4s,2s    #imm8, LSL #0, 8, 16, 24
+   mvni  4s,2s    #imm8, MSL #8 or 16
+
+   neg   d
+   neg   2d,4s,2s,8h,4h,16b,8b
+
+   not   16b,8b
+
+   orn   16b,8b
+
+   orr   8h,4h   #imm8, LSL #0 or 8
+   orr   4s,2s   #imm8, LSL #0, 8, 16 or 24
+
+   orr   16b,8b
+
+   pmul  16b,8b
+
+   pmull{2}  8h_8b_8b,8h_16b_16b,1q_1d_1d,1q_2d_2d
+
+   raddhn{2}  2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+
+   rbit    16b,8b
+   rev16   16b,8b
+   rev32   16b,8b,8h,4h
+   rev64   16b,8b,8h,4h,4s,2s
+
+   rshrn{2}  2s/4s_2d, 4h/8h_4s, 8b/16b_8h,   #imm in 1 .. elem_bits
+
+   rsubhn{2}  2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+
+   saba      16b,8b,8h,4h,4s,2s
+   sabal{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   sabd      16b,8b,8h,4h,4s,2s
+   sabdl{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   sadalp    4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+
+   saddl{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   saddlp    4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+
+   saddlv    h_16b/8b, s_8h/4h, d_4s
+
+   saddw{2}  8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+
+   scvtf     d,s        _#fbits
+   scvtf     2d,4s,2s   _#fbits
+
+   scvtf     d,s
+   scvtf     2d,4s,2s
+
+   scvtf     s_w, d_w, s_x, d_x,   _#fbits
+   scvtf     s_w, d_w, s_x, d_x
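+             (e.g. scvtf d0, x1, #12 converts x1, viewed as signed
+              fixed point with 12 fraction bits, to double)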
+
+   sha1c       q_s_4s
+   sha1h       s_s
+   sha1m       q_s_4s
+   sha1p       q_s_4s
+   sha1su0     4s_4s_4s
+   sha1su1     4s_4s
+   sha256h2    q_q_4s
+   sha256h     q_q_4s
+   sha256su0   4s_4s
+   sha256su1   4s_4s_4s
+
+   shadd       16b,8b,8h,4h,4s,2s
+
+   shl         d_#imm
+   shl         16b,8b,8h,4h,4s,2s,2d  _#imm
+
+   shll{2}   8h_8b/16b_#8, 4s_4h/8h_#16, 2d_2s/4s_#32
+
+   shrn{2}  2s/4s_2d, 4h/8h_4s, 8b/16b_8h,   #imm in 1 .. elem_bits
+
+   shsub       16b,8b,8h,4h,4s,2s
+
+   sli         d_#imm
+   sli         2d,4s,2s,8h,4h,16b,8b  _#imm
+
+   smax        4s,2s,8h,4h,16b,8b
+
+   smaxp       4s,2s,8h,4h,16b,8b
+
+   smaxv       s_4s,h_8h,h_4h,b_16b,b_8b
+
+   smin        4s,2s,8h,4h,16b,8b
+
+   sminp       4s,2s,8h,4h,16b,8b
+
+   sminv       s_4s,h_8h,h_4h,b_16b,b_8b
+
+   smlal{2}    2d_2s/4s_s[], 4s_4h/8h_h[]
+   smlal{2}    2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   smlsl{2}    2d_2s/4s_s[], 4s_4h/8h_h[]
+   smlsl{2}    2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   smov        w_b[], w_h[], x_b[], x_h[], x_s[]
+
+   smull{2}    2d_2s/4s_s[], 4s_4h/8h_h[]
+   smull{2}    2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   sqabs       d,s,h,b
+   sqabs       2d,4s,2s,8h,4h,16b,8b
+
+   sqadd       d,s,h,b
+   sqadd       2d,4s,2s,8h,4h,16b,8b
+
+   sqdmlal     d_s_s[], s_h_h[]
+   sqdmlal{2}  2d_2s/4s_s[], 4s_4h/8h_h[]
+
+   sqdmlal     d_s_s, s_h_h
+   sqdmlal{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
+
+   sqdmlsl     d_s_s[], s_h_h[]
+   sqdmlsl{2}  2d_2s/4s_s[], 4s_4h/8h_h[]
+
+   sqdmlsl     d_s_s, s_h_h
+   sqdmlsl{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
+
+   sqdmulh     s_s_s[], h_h_h[]
+   sqdmulh     4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
+
+   sqdmulh     h,s
+   sqdmulh     4s,2s,8h,4h
+
+   sqdmull     d_s_s[], s_h_h[]
+   sqdmull{2}  2d_2s/4s_s[], 4s_4h/8h_h[]
+
+   sqdmull     d_s_s,s_h_h
+   sqdmull{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)
+
+   sqneg       d,s,h,b
+   sqneg       2d,4s,2s,8h,4h,16b,8b
+
+   sqrdmulh    s_s_s[], h_h_h[]
+   sqrdmulh    4s_4s_s[], 2s_2s_s[], 8h_8h_h[], 4h_4h_h[]
+
+   sqrdmulh    h,s
+   sqrdmulh    4s,2s,8h,4h
+
+   sqrshl      d,s,h,b
+   sqrshl      2d,4s,2s,8h,4h,16b,8b
+
+   sqrshrn     s_d, h_s, b_h   #imm
+   sqrshrn{2}  2s/4s_2d, 4h/8h_4s, 8b/16b_8h,  #imm
+
+   sqrshrun     s_d, h_s, b_h   #imm
+   sqrshrun{2}  2s/4s_2d, 4h/8h_4s, 8b/16b_8h,  #imm
+
+   sqshl        d,s,h,b   _#imm
+   sqshl        2d,4s,2s,8h,4h,16b,8b   _#imm
+
+   sqshl        d,s,h,b
+   sqshl        2d,4s,2s,8h,4h,16b,8b
+
+   sqshlu       d,s,h,b  _#imm
+   sqshlu       2d,4s,2s,8h,4h,16b,8b  _#imm
+
+   sqshrn       s_d, h_s, b_h   #imm
+   sqshrn{2}    2s/4s_2d, 4h/8h_4s, 8b/16b_8h,  #imm
+
+   sqshrun      s_d, h_s, b_h   #imm
+   sqshrun{2}   2s/4s_2d, 4h/8h_4s, 8b/16b_8h,  #imm
+
+   sqsub       d,s,h,b
+   sqsub       2d,4s,2s,8h,4h,16b,8b
+
+   sqxtn       s_d,h_s,b_h
+   sqxtn{2}    2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+
+   sqxtun      s_d,h_s,b_h
+   sqxtun{2}   2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+
+   srhadd      4s,2s,8h,4h,16b,8b
+
+   sri         d_#imm
+   sri         2d,4s,2s,8h,4h,16b,8b  _#imm
+
+   srshl (reg) d
+   srshl       2d,4s,2s,8h,4h,16b,8b
+
+   srshr (imm) d
+   srshr       2d,4s,2s,8h,4h,16b,8b
+
+   srsra (imm) d
+   srsra       2d,4s,2s,8h,4h,16b,8b
+
+   sshl (reg)  d
+   sshl        2d,4s,2s,8h,4h,16b,8b
+
+   sshll{2} (imm)  2d_2s/4s, 4s_4h/8h, 8h_8b/16b
+
+   sshr (imm)  d
+   sshr        2d,4s,2s,8h,4h,16b,8b
+
+   ssra (imm)  d
+   ssra        2d,4s,2s,8h,4h,16b,8b
+
+   ssubl{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   ssubw{2}  8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+
+   st1 (multiple 1-element structures from 1/2/3/4 regs)
+   st1 (single 1-element structure from 1 lane of 1 reg)
+
+   st2 (multiple 2-element structures from 2 regs)
+   st2 (single 2-element structure from 1 lane of 2 regs)
+
+   st3 (multiple 3-element structures from 3 regs)
+   st3 (single 3-element structure from 1 lane of 3 regs)
+
+   st4 (multiple 4-element structures from 4 regs)
+   st4 (single 4-element structure from one lane of 4 regs)
+
+   stnp q_q_addr, d_d_addr, s_s_addr
+        addr = [Xn|SP, #imm]
+
+   stp  q_q_addr, d_d_addr, s_s_addr
+        addr = [Xn|SP], #imm  or [Xn|SP, #imm]!  or [Xn|SP, #imm]
+
+   str  q,d,s,h,b_addr
+        addr = [Xn|SP], #simm  or [Xn|SP, #simm]!  or [Xn|SP, #pimm]
+
+   str   q,d,s,h,b_addr
+         addr = [Xn|SP, R <extend> <shift>]
+
+   stur  q,d,s,h,b_addr
+         addr = [Xn|SP,#imm] (unscaled offset)
+
+   sub   d
+   sub   2d,4s,2s,8h,4h,16b,8b
+
+   subhn{2}  2s/4s_2d_2d, 4h/8h_4s_4s, 8b/16b_8h_8h
+
+   suqadd  d,s,h,b
+   suqadd  2d,4s,2s,8h,4h,16b,8b
+
+   tbl     8b_{16b}_8b, 16b_{16b}_16b
+   tbl     8b_{16b,16b}_8b, 16b_{16b,16b}_16b
+   tbl     8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
+   tbl     8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
+
+   tbx     8b_{16b}_8b, 16b_{16b}_16b
+   tbx     8b_{16b,16b}_8b, 16b_{16b,16b}_16b
+   tbx     8b_{16b,16b,16b}_8b, 16b_{16b,16b,16b}_16b
+   tbx     8b_{16b,16b,16b,16b}_8b, 16b_{16b,16b,16b,16b}_16b
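+
+           (e.g. tbl v0.8b, {v1.16b, v2.16b}, v3.8b -- v3 supplies byte
+            indices into the 32-byte table v1:v2; out-of-range indices
+            yield 0 for tbl but leave the dest byte unchanged for tbx)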
+
+   trn1    2d,4s,2s,8h,4h,16b,8b
+   trn2    2d,4s,2s,8h,4h,16b,8b
+
+   uaba      16b,8b,8h,4h,4s,2s
+   uabal{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   uabd      16b,8b,8h,4h,4s,2s
+   uabdl{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   uadalp    4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+
+   uaddl{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   uaddlp    4h_8b,8h_16b,2s_4h,4s_8h,1d_2s,2d_4s
+
+   uaddlv    h_16b/8b, s_8h/4h, d_4s
+
+   uaddw{2}  8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+
+   ucvtf     d,s        _#fbits
+   ucvtf     2d,4s,2s   _#fbits
+
+   ucvtf     d,s
+   ucvtf     2d,4s,2s
+
+   ucvtf     s_w, d_w, s_x, d_x,   _#fbits
+   ucvtf     s_w, d_w, s_x, d_x
+
+   uhadd       16b,8b,8h,4h,4s,2s
+
+   uhsub       16b,8b,8h,4h,4s,2s
+
+   umax        4s,2s,8h,4h,16b,8b
+
+   umaxp       4s,2s,8h,4h,16b,8b
+
+   umaxv       s_4s,h_8h,h_4h,b_16b,b_8b
+
+   umin        4s,2s,8h,4h,16b,8b
+
+   uminp       4s,2s,8h,4h,16b,8b
+
+   uminv       s_4s,h_8h,h_4h,b_16b,b_8b
+
+   umlal{2}    2d_2s/4s_s[], 4s_4h/8h_h[]
+   umlal{2}    2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   umlsl{2}    2d_2s/4s_s[], 4s_4h/8h_h[]
+   umlsl{2}    2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   umov        w_b[], w_h[], x_b[], x_h[], x_s[]
+
+   umull{2}    2d_2s/4s_s[], 4s_4h/8h_h[]
+   umull{2}    2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   uqadd       d,s,h,b
+   uqadd       2d,4s,2s,8h,4h,16b,8b
+
+   uqrshl      d,s,h,b
+   uqrshl      2d,4s,2s,8h,4h,16b,8b
+
+   uqrshrn     s_d, h_s, b_h   #imm
+   uqrshrn{2}  2s/4s_2d, 4h/8h_4s, 8b/16b_8h,  #imm
+
+   uqshl        d,s,h,b   _#imm
+   uqshl        2d,4s,2s,8h,4h,16b,8b   _#imm
+
+   uqshl        d,s,h,b
+   uqshl        2d,4s,2s,8h,4h,16b,8b
+
+   uqshrn       s_d, h_s, b_h   #imm
+   uqshrn{2}    2s/4s_2d, 4h/8h_4s, 8b/16b_8h,  #imm
+
+   uqsub       d,s,h,b
+   uqsub       2d,4s,2s,8h,4h,16b,8b
+
+   uqxtn       s_d,h_s,b_h
+   uqxtn{2}    2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+
+   urecpe      4s,2s
+
+   urhadd      4s,2s,8h,4h,16b,8b
+
+   urshl (reg) d
+   urshl       2d,4s,2s,8h,4h,16b,8b
+
+   urshr (imm) d
+   urshr       2d,4s,2s,8h,4h,16b,8b
+
+   ursqrte     4s,2s
+
+   ursra (imm) d
+   ursra       2d,4s,2s,8h,4h,16b,8b
+
+   ushl (reg)  d
+   ushl        2d,4s,2s,8h,4h,16b,8b
+
+   ushll{2} (imm)  2d_2s/4s, 4s_4h/8h, 8h_8b/16b
+
+   ushr (imm)  d
+   ushr        2d,4s,2s,8h,4h,16b,8b
+
+   usqadd      d,s,h,b
+   usqadd      2d,4s,2s,8h,4h,16b,8b
+
+   usra (imm)  d
+   usra        2d,4s,2s,8h,4h,16b,8b
+
+   usubl{2}  2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h), 8h_(8b_8b)/(16b_16b)
+
+   usubw{2}  8h_8h_16b/8b, 4s_4s_8h/4h, 2d_2d_2s/4s
+
+   uzp1      2d,4s,2s,8h,4h,16b,8b
+   uzp2      2d,4s,2s,8h,4h,16b,8b
+
+   xtn{2}    2s/4s_2d, 4h/8h_4s, 8b/16b_8h
+
+   zip1      2d,4s,2s,8h,4h,16b,8b
+   zip2      2d,4s,2s,8h,4h,16b,8b
+*/
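+
+/* A minimal sketch of how one entry in the list above might become a
+   test case, assuming a GCC inline-assembly, printf-based harness in
+   the style of the test_* functions called from main().  Illustrative
+   only; nothing calls it. */
+void sketch_test_abs_16b_16b ( void )
+{
+   /* block[0..15] holds the input vector, block[16..31] the result. */
+   unsigned char block[32];
+   int i;
+   for (i = 0; i < 16; i++) block[i] = (unsigned char)(0xF8 + i);
+   __asm__ __volatile__(
+      "ldr q7, [%0, #0]   \n\t"   /* load the input vector   */
+      "abs v8.16b, v7.16b \n\t"   /* the insn under test     */
+      "str q8, [%0, #16]  \n\t"   /* store the result vector */
+      : : "r"(block) : "memory", "v7", "v8"
+   );
+   printf("abs v8.16b, v7.16b  ");
+   for (i = 0; i < 16; i++) printf("%02x", block[16+i]);
+   printf("\n");
+}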