Add support for

VMOVSLDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 12 /r
VMOVSLDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 12 /r
VMOVSHDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 16 /r
VMOVSHDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 16 /r

VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r
VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r

VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib
VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib

(Jakub Jelinek, jakub@redhat.com), #273475 comments 121, 122, 124



git-svn-id: svn://svn.valgrind.org/vex/trunk@2394 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c
index 6b06fa0..0288ed0 100644
--- a/priv/guest_amd64_toIR.c
+++ b/priv/guest_amd64_toIR.c
@@ -13753,6 +13753,77 @@
 }
 
 
+static Long dis_MOVSxDUP_128 ( VexAbiInfo* vbi, Prefix pfx,
+                               Long delta, Bool isAvx, Bool isL )
+{
+   IRTemp addr  = IRTemp_INVALID;
+   Int    alen  = 0;
+   HChar  dis_buf[50];
+   IRTemp sV    = newTemp(Ity_V128);
+   UChar  modrm = getUChar(delta);
+   UInt   rG    = gregOfRexRM(pfx,modrm);
+   IRTemp s3, s2, s1, s0;
+   s3 = s2 = s1 = s0 = IRTemp_INVALID;
+   if (epartIsReg(modrm)) {
+      UInt rE = eregOfRexRM(pfx,modrm);
+      assign( sV, getXMMReg(rE) );
+      DIP("%smovs%cdup %s,%s\n",
+          isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
+      delta += 1;
+   } else {
+      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+      if (!isAvx)
+         gen_SEGV_if_not_16_aligned( addr );
+      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+      DIP("%smovs%cdup %s,%s\n",
+          isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
+      delta += alen;
+   }
+   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
+   (isAvx ? putYMMRegLoAndZU : putXMMReg)
+      ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
+                : mkV128from32s( s3, s3, s1, s1 ) );
+   return delta;
+}
+
+
+static Long dis_MOVSxDUP_256 ( VexAbiInfo* vbi, Prefix pfx,
+                               Long delta, Bool isL )
+{
+   IRTemp addr  = IRTemp_INVALID;
+   Int    alen  = 0;
+   HChar  dis_buf[50];
+   IRTemp sV    = newTemp(Ity_V256);
+   IRTemp sVhi  = IRTemp_INVALID;
+   IRTemp sVlo  = IRTemp_INVALID;
+   UChar  modrm = getUChar(delta);
+   UInt   rG    = gregOfRexRM(pfx,modrm);
+   IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
+   s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
+   if (epartIsReg(modrm)) {
+      UInt rE = eregOfRexRM(pfx,modrm);
+      assign( sV, getYMMReg(rE) );
+      DIP("vmovs%cdup %s,%s\n",
+          isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
+      delta += 1;
+   } else {
+      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
+      DIP("vmovs%cdup %s,%s\n",
+          isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
+      delta += alen;
+   }
+   breakupV256toV128s( sV, &sVhi, &sVlo );
+   breakupV128to32s( sVhi, &s7, &s6, &s5, &s4 );
+   breakupV128to32s( sVlo, &s3, &s2, &s1, &s0 );
+   putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
+                                : mkV128from32s( s7, s7, s5, s5 ) );
+   putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
+                                : mkV128from32s( s3, s3, s1, s1 ) );
+   return delta;
+}
+
+
 __attribute__((noinline))
 static
 Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
@@ -13775,28 +13846,8 @@
       /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
          duplicating some lanes (2:2:0:0). */
       if (haveF3no66noF2(pfx) && sz == 4) {
-         IRTemp s3, s2, s1, s0;
-         IRTemp sV  = newTemp(Ity_V128);
-         s3 = s2 = s1 = s0 = IRTemp_INVALID;
-
-         modrm = getUChar(delta);
-         if (epartIsReg(modrm)) {
-            assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
-            DIP("movsldup %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
-                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
-            delta += 1;
-         } else {
-            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
-            gen_SEGV_if_not_16_aligned( addr );
-            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
-            DIP("movsldup %s,%s\n", dis_buf,
-                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
-            delta += alen;
-         }
-
-         breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
-         putXMMReg( gregOfRexRM(pfx,modrm), 
-                    mkV128from32s( s2, s2, s0, s0 ) );
+         delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
+                                   True/*isL*/ );
          goto decode_success;
       }
       /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
@@ -13812,28 +13863,8 @@
       /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
          duplicating some lanes (3:3:1:1). */
       if (haveF3no66noF2(pfx) && sz == 4) {
-         IRTemp s3, s2, s1, s0;
-         IRTemp sV  = newTemp(Ity_V128);
-         s3 = s2 = s1 = s0 = IRTemp_INVALID;
-
-         modrm = getUChar(delta);
-         if (epartIsReg(modrm)) {
-            assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
-            DIP("movshdup %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
-                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
-            delta += 1;
-         } else {
-            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
-            gen_SEGV_if_not_16_aligned( addr );
-            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
-            DIP("movshdup %s,%s\n", dis_buf,
-                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
-            delta += alen;
-         }
-
-         breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
-         putXMMReg( gregOfRexRM(pfx,modrm), 
-                    mkV128from32s( s3, s3, s1, s1 ) );
+         delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
+                                   False/*!isL*/ );
          goto decode_success;
       }
       break;
@@ -19961,11 +19992,11 @@
    shl = shr = sar = False;
    size = 0;
    switch (op) {
-      //case Iop_ShlN16x8: shl = True; size = 16; break;
+      case Iop_ShlN16x8: shl = True; size = 16; break;
       case Iop_ShlN32x4: shl = True; size = 32; break;
       case Iop_ShlN64x2: shl = True; size = 64; break;
       case Iop_SarN16x8: sar = True; size = 16; break;
-      //case Iop_SarN32x4: sar = True; size = 32; break;
+      case Iop_SarN32x4: sar = True; size = 32; break;
       case Iop_ShrN16x8: shr = True; size = 16; break;
       case Iop_ShrN32x4: shr = True; size = 32; break;
       case Iop_ShrN64x2: shr = True; size = 64; break;
@@ -20615,6 +20646,26 @@
          delta += alen;
          goto decode_success;
       }
+      /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
+      /* Reg form. */
+      if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
+         UChar modrm = getUChar(delta);
+         UInt  rG    = gregOfRexRM(pfx, modrm);
+         UInt  rE    = eregOfRexRM(pfx, modrm);
+         UInt  rV    = getVexNvvvv(pfx);
+         delta++;
+         DIP("vmovss %s,%s,%s\n",
+             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
+         IRTemp res = newTemp(Ity_V128);
+         assign( res, binop( Iop_64HLtoV128,
+                             getXMMRegLane64(rV, 1),
+                             binop(Iop_32HLto64,
+                                   getXMMRegLane32(rV, 1),
+                                   getXMMRegLane32(rE, 0)) ) );
+         putYMMRegLoAndZU(rG, mkexpr(res));
+         *uses_vvvv = True;
+         goto decode_success;
+      }
       /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
          UChar modrm = getUChar(delta);
@@ -20728,6 +20779,26 @@
          delta += alen;
          goto decode_success;
       }
+      /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r */
+      /* Reg form. */
+      if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
+         UChar modrm = getUChar(delta);
+         UInt  rG    = gregOfRexRM(pfx, modrm);
+         UInt  rE    = eregOfRexRM(pfx, modrm);
+         UInt  rV    = getVexNvvvv(pfx);
+         delta++;
+         DIP("vmovss %s,%s,%s\n",
+             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
+         IRTemp res = newTemp(Ity_V128);
+         assign( res, binop( Iop_64HLtoV128,
+                             getXMMRegLane64(rV, 1),
+                             binop(Iop_32HLto64,
+                                   getXMMRegLane32(rV, 1),
+                                   getXMMRegLane32(rE, 0)) ) );
+         putYMMRegLoAndZU(rG, mkexpr(res));
+         *uses_vvvv = True;
+         goto decode_success;
+      }
       /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */
       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
          UChar modrm = getUChar(delta);
@@ -20842,6 +20913,17 @@
          *uses_vvvv = True;
          goto decode_success;
       }
+      /* VMOVSLDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 12 /r */
+      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
+         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
+                                   True/*isL*/ );
+         goto decode_success;
+      }
+      /* VMOVSLDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 12 /r */
+      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
+         delta = dis_MOVSxDUP_256( vbi, pfx, delta, True/*isL*/ );
+         goto decode_success;
+      }
       break;
 
    case 0x13:
@@ -21014,6 +21096,17 @@
          *uses_vvvv = True;
          goto decode_success;
       }
+      /* VMOVSHDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 16 /r */
+      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
+         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
+                                   False/*!isL*/ );
+         goto decode_success;
+      }
+      /* VMOVSHDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 16 /r */
+      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
+         delta = dis_MOVSxDUP_256( vbi, pfx, delta, False/*!isL*/ );
+         goto decode_success;
+      }
       break;
 
    case 0x17:
@@ -22097,6 +22190,7 @@
    case 0x71:
       /* VPSRLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /2 ib */
       /* VPSRAW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /4 ib */
+      /* VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib */
       if (have66noF2noF3(pfx)
           && 0==getVexL(pfx)/*128*/
           && epartIsReg(getUChar(delta))) {
@@ -22112,28 +22206,41 @@
             *uses_vvvv = True;
             goto decode_success;
          }
+         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
+            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
+                                                "vpsllw", Iop_ShlN16x8 );
+            *uses_vvvv = True;
+            goto decode_success;
+         }
          /* else fall through */
       }
       break;
 
    case 0x72:
-      /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
       /* VPSRLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /2 ib */
+      /* VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib */
+      /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
       if (have66noF2noF3(pfx)
           && 0==getVexL(pfx)/*128*/
           && epartIsReg(getUChar(delta))) {
-         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
-            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
-                                                "vpslld", Iop_ShlN32x4 );
-            *uses_vvvv = True;
-            goto decode_success;
-         }
          if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                 "vpsrld", Iop_ShrN32x4 );
             *uses_vvvv = True;
             goto decode_success;
          }
+         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
+            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
+                                                "vpsrad", Iop_SarN32x4 );
+            *uses_vvvv = True;
+            goto decode_success;
+         }
+         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
+            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
+                                                "vpslld", Iop_ShlN32x4 );
+            *uses_vvvv = True;
+            goto decode_success;
+         }
          /* else fall through */
       }
       break;