gallivm: optimize gather a bit, by using supplied destination type

By using a dst_type in the the gather interface, gather has some more
knowledge about how values should be fetched.
E.g. if this is a 3x32bit fetch and dst_type is 4x32bit vector gather
will no longer do a ZExt with a 96bit scalar value to 128bit, but
just fetch the 96bit as 3x32bit vector (this is still going to be
2 loads of course, but the loads can be done directly to simd vector
that way).
Also, we can now do some try to use the right int/float type. This should
make no difference really since there's typically no domain transition
penalties for such simd loads, however it actually makes a difference
since llvm will use different shuffle lowering afterwards so the caller
can use this to trick llvm into using sane shuffle afterwards (and yes
llvm is really stupid there - nothing against using the shuffle
instruction from the correct domain, but not at the cost of doing 3 times
more shuffles, the case which actually matters is refusal to use shufps
for integer values).
Also do some attempt to avoid things which look great on paper but llvm
doesn't really handle (e.g. fetching 3-element 8 bit and 16 bit vectors
which is simply disastrous - I suspect type legalizer is to blame trying
to extend these vectors to 128bit types somehow, so fetching these with
scalars like before which is suboptimal due to the ZExt).

Remove the ability for truncation (no point, this is gather, not conversion)
as it is complex enough already.

While here also implement not just the float, but also the 64bit avx2
gathers (disabled though since based on the theoretical numbers the benefit
just isn't there at all until Skylake at least).

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index c548572..19b75a5 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1864,7 +1864,7 @@
                                        LLVMPointerType(LLVMInt8TypeInContext(context),
                                                        0), "");
          tmp = lp_build_gather(gallivm, vs_type.length,
-                               32, 32, TRUE,
+                               32, bld.type, TRUE,
                                fetch_elts, tmp, FALSE);
          LLVMBuildStore(builder, tmp, index_store);
       }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 9f6b9e9..322e7b8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -224,6 +224,7 @@
    /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
     * into masked = {X, Y, Z, W}
     */
+   /* Note: we cannot do this shift on x86 natively until AVX2. */
    shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
    masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
 
@@ -394,6 +395,7 @@
        util_is_power_of_two(format_desc->block.bits)) {
       LLVMValueRef packed;
       LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
+      struct lp_type fetch_type;
       unsigned vec_len = type.width * type.length;
 
       /*
@@ -401,8 +403,9 @@
        * scaling or converting.
        */
 
+      fetch_type = lp_type_uint(type.width*4);
       packed = lp_build_gather(gallivm, type.length/4,
-                               format_desc->block.bits, type.width*4,
+                               format_desc->block.bits, fetch_type,
                                aligned, base_ptr, offset, TRUE);
 
       assert(format_desc->block.bits <= vec_len);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
index 8cad3a6..636a4a6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -70,7 +70,14 @@
 
    src_vec_type  = lp_build_vec_type(gallivm,  src_type);
 
-   /* Read whole vector from memory, unaligned */
+   /*
+    * Read whole vector from memory, unaligned.
+    * XXX: Note it's actually aligned to element type. Not sure if all
+    * callers are able to guarantee that (whereas for others, we should
+    * be able to use full alignment when there's 2 or 4 channels).
+    * (If all callers can guarantee element type alignment, we should
+    * relax alignment restrictions elsewhere.)
+    */
    ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
    ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
    res = LLVMBuildLoad(builder, ptr, "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index a48d71f..b3bc155 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -405,6 +405,8 @@
                         LLVMValueRef rgba_out[4])
 {
    LLVMBuilderRef builder = gallivm->builder;
+   enum pipe_format format = format_desc->format;
+   struct lp_type fetch_type;
 
    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
@@ -430,10 +432,11 @@
        * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
        */
       assert(format_desc->block.bits <= type.width);
+      fetch_type = lp_type_uint(type.width);
       packed = lp_build_gather(gallivm,
                                type.length,
                                format_desc->block.bits,
-                               type.width,
+                               fetch_type,
                                aligned,
                                base_ptr, offset, FALSE);
 
@@ -447,22 +450,23 @@
       return;
    }
 
-   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
-       format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
       /*
        * similar conceptually to above but requiring special
        * AoS packed -> SoA float conversion code.
        */
       LLVMValueRef packed;
+      struct lp_type fetch_type = lp_type_uint(type.width);
 
       assert(type.floating);
       assert(type.width == 32);
 
       packed = lp_build_gather(gallivm, type.length,
                                format_desc->block.bits,
-                               type.width, aligned,
+                               fetch_type, aligned,
                                base_ptr, offset, FALSE);
-      if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
          lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
       }
       else {
@@ -478,8 +482,9 @@
        * 32bit (or 8bit) from each block.
        */
       LLVMValueRef packed;
+      struct lp_type fetch_type = lp_type_uint(type.width);
 
-      if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) {
+      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
          /*
           * for stencil simply fix up offsets - could in fact change
           * base_ptr instead even outside the shader.
@@ -487,14 +492,14 @@
          unsigned mask = (1 << 8) - 1;
          LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
          offset = LLVMBuildAdd(builder, offset, s_offset, "");
-         packed = lp_build_gather(gallivm, type.length, 32, type.width,
+         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                   aligned, base_ptr, offset, FALSE);
          packed = LLVMBuildAnd(builder, packed,
                                lp_build_const_int_vec(gallivm, type, mask), "");
       }
       else {
-         assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
-         packed = lp_build_gather(gallivm, type.length, 32, type.width,
+         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                   aligned, base_ptr, offset, TRUE);
          packed = LLVMBuildBitCast(builder, packed,
                                    lp_build_vec_type(gallivm, type), "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index fa0e8b6..d6d7552 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -491,13 +491,15 @@
 {
    LLVMValueRef packed;
    LLVMValueRef rgba;
+   struct lp_type fetch_type;
 
    assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
    assert(format_desc->block.bits == 32);
    assert(format_desc->block.width == 2);
    assert(format_desc->block.height == 1);
 
-   packed = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE);
+   fetch_type = lp_type_uint(32);
+   packed = lp_build_gather(gallivm, n, 32, fetch_type, TRUE, base_ptr, offset, FALSE);
 
    (void)j;
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
index 1f7ba92..7654ba0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -28,6 +28,7 @@
 
 #include "util/u_debug.h"
 #include "util/u_cpu_detect.h"
+#include "util/u_math.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_const.h"
 #include "lp_bld_format.h"
@@ -36,6 +37,7 @@
 #include "lp_bld_type.h"
 #include "lp_bld_init.h"
 #include "lp_bld_intr.h"
+#include "lp_bld_pack.h"
 
 
 /**
@@ -114,14 +116,29 @@
     * translation of offsets to first_elem in sampler_views it actually seems
     * gallium could not do anything else except 16 no matter what...
     */
-  if (!aligned) {
+   if (!aligned) {
       LLVMSetAlignment(res, 1);
+   } else if (!util_is_power_of_two(src_width)) {
+      /*
+       * Full alignment is impossible, assume the caller really meant
+       * the individual elements were aligned (e.g. 3x32bit format).
+       * And yes the generated code may otherwise crash, llvm will
+       * really assume 128bit alignment with a 96bit fetch (I suppose
+       * that makes sense as it can just assume the upper 32bit to be
+       * whatever).
+       * Maybe the caller should be able to explicitly set this, but
+       * this should cover all the 3-channel formats.
+       */
+      if (((src_width / 24) * 24 == src_width) &&
+           util_is_power_of_two(src_width / 24)) {
+          LLVMSetAlignment(res, src_width / 24);
+      } else {
+         LLVMSetAlignment(res, 1);
+      }
    }
 
    assert(src_width <= dst_width);
-   if (src_width > dst_width) {
-      res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, "");
-   } else if (src_width < dst_width) {
+   if (src_width < dst_width) {
       res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
       if (vector_justify) {
 #ifdef PIPE_ARCH_BIG_ENDIAN
@@ -135,28 +152,134 @@
 }
 
 
+/**
+ * Gather one element from scatter positions in memory.
+ * Nearly the same as above, however the individual elements
+ * may be vectors themselves, and fetches may be float type.
+ * Can also do pad vector instead of ZExt.
+ *
+ * @sa lp_build_gather()
+ */
+static LLVMValueRef
+lp_build_gather_elem_vec(struct gallivm_state *gallivm,
+                         unsigned length,
+                         unsigned src_width,
+                         LLVMTypeRef src_type,
+                         struct lp_type dst_type,
+                         boolean aligned,
+                         LLVMValueRef base_ptr,
+                         LLVMValueRef offsets,
+                         unsigned i,
+                         boolean vector_justify)
+{
+   LLVMValueRef ptr, res;
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
+
+   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
+   ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
+   res = LLVMBuildLoad(gallivm->builder, ptr, "");
+
+   /* XXX
+    * On some archs we probably really want to avoid having to deal
+    * with alignments lower than 4 bytes (if fetch size is a power of
+    * two >= 32). On x86 it doesn't matter, however.
+    * We should be able to guarantee full alignment for any kind of texture
+    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
+    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
+    * but I don't think that's quite what we wanted).
+    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
+    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
+    * enforcing what we want (which is what d3d10 does, the offset needs to
+    * be aligned to element size, but GL has bytes regardless of element
+    * size which would only leave us with minimum alignment restriction of 16
+    * which doesn't make much sense if the type isn't 4x32bit). Due to
+    * translation of offsets to first_elem in sampler_views it actually seems
+    * gallium could not do anything else except 16 no matter what...
+    */
+   if (!aligned) {
+      LLVMSetAlignment(res, 1);
+   } else if (!util_is_power_of_two(src_width)) {
+      /*
+       * Full alignment is impossible, assume the caller really meant
+       * the individual elements were aligned (e.g. 3x32bit format).
+       * And yes the generated code may otherwise crash, llvm will
+       * really assume 128bit alignment with a 96bit fetch (I suppose
+       * that makes sense as it can just assume the upper 32bit to be
+       * whatever).
+       * Maybe the caller should be able to explicitly set this, but
+       * this should cover all the 3-channel formats.
+       */
+      if (((src_width / 24) * 24 == src_width) &&
+           util_is_power_of_two(src_width / 24)) {
+          LLVMSetAlignment(res, src_width / 24);
+      } else {
+         LLVMSetAlignment(res, 1);
+      }
+   }
+
+   assert(src_width <= dst_type.width * dst_type.length);
+   if (src_width < dst_type.width * dst_type.length) {
+      if (dst_type.length > 1) {
+         res = lp_build_pad_vector(gallivm, res, dst_type.length);
+         /*
+          * vector_justify hopefully a non-issue since we only deal
+          * with src_width >= 32 here?
+          */
+      } else {
+         /*
+          * Only valid if src_ptr_type is int type...
+          */
+         res = LLVMBuildZExt(gallivm->builder, res,
+                             lp_build_vec_type(gallivm, dst_type), "");
+         if (vector_justify) {
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         res = LLVMBuildShl(gallivm->builder, res,
+                            LLVMConstInt(dst_elem_type,
+                                         dst_type.width - src_width, 0), "");
+#endif
+         }
+      }
+   }
+   return res;
+}
+
+
+
+
 static LLVMValueRef
 lp_build_gather_avx2(struct gallivm_state *gallivm,
                      unsigned length,
                      unsigned src_width,
-                     unsigned dst_width,
+                     struct lp_type dst_type,
                      LLVMValueRef base_ptr,
                      LLVMValueRef offsets)
 {
    LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef dst_type = LLVMIntTypeInContext(gallivm->context, dst_width);
-   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_type, length);
-   LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
-   LLVMTypeRef src_vec_type = LLVMVectorType(src_type, length);
+   LLVMTypeRef src_type, src_vec_type;
    LLVMValueRef res;
+   struct lp_type res_type = dst_type;
+   res_type.length *= length;
 
+   if (dst_type.floating) {
+      src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
+                                   LLVMFloatTypeInContext(gallivm->context);
+   } else {
+      src_type = LLVMIntTypeInContext(gallivm->context, src_width);
+   }
+   src_vec_type = LLVMVectorType(src_type, length);
+
+   /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
    assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
 
    if (0) {
       /*
        * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
-       * will not use the AVX2 gather instrinsics.  See
+       * will not use the AVX2 gather instrinsics (even with llvm 4.0), at
+       * least with Haswell. See
        * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
+       * And the generated code doing the emulation is quite a bit worse
+       * than what we get by doing it ourselves too.
        */
       LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
       LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
@@ -176,7 +299,8 @@
       src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
 
       char intrinsic[64];
-      util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%ui%u", length, src_width);
+      util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
+                    length, dst_type.floating ? "f" : "i", src_width);
       LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
       LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
       LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
@@ -185,27 +309,36 @@
 
       res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
    } else {
-      assert(src_width == 32);
-
       LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
-
-      /*
-       * We should get the caller to give more type information so we can use
-       * the intrinsics for the right int/float domain.  Int should be the most
-       * common.
-       */
       const char *intrinsic = NULL;
-      switch (length) {
-      case 4:
-         intrinsic = "llvm.x86.avx2.gather.d.d";
-         break;
-      case 8:
-         intrinsic = "llvm.x86.avx2.gather.d.d.256";
-         break;
-      default:
-         assert(0);
+      unsigned l_idx = 0;
+
+      assert(src_width == 32 || src_width == 64);
+      if (src_width == 32) {
+         assert(length == 4 || length == 8);
+      } else {
+         assert(length == 2 || length == 4);
       }
 
+      static const char *intrinsics[2][2][2] = {
+
+         {{"llvm.x86.avx2.gather.d.d",
+           "llvm.x86.avx2.gather.d.d.256"},
+          {"llvm.x86.avx2.gather.d.q",
+           "llvm.x86.avx2.gather.d.q.256"}},
+
+         {{"llvm.x86.avx2.gather.d.ps",
+           "llvm.x86.avx2.gather.d.ps.256"},
+          {"llvm.x86.avx2.gather.d.pd",
+           "llvm.x86.avx2.gather.d.pd.256"}},
+      };
+
+      if ((src_width == 32 && length == 8) ||
+          (src_width == 64 && length == 4)) {
+         l_idx = 1;
+      }
+      intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
+
       LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
       LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
       mask = LLVMConstBitCast(mask, src_vec_type);
@@ -215,12 +348,7 @@
 
       res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
    }
-
-   if (src_width > dst_width) {
-      res = LLVMBuildTrunc(builder, res, dst_vec_type, "");
-   } else if (src_width < dst_width) {
-      res = LLVMBuildZExt(builder, res, dst_vec_type, "");
-   }
+   res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
 
    return res;
 }
@@ -241,9 +369,11 @@
  *
  * @param length length of the offsets
  * @param src_width src element width in bits
- * @param dst_width result element width in bits (src will be expanded to fit)
+ * @param dst_type result element type (src will be expanded to fit,
+ *        but truncation is not allowed)
+ *        (this may be a vector, must be pot sized)
  * @param aligned whether the data is guaranteed to be aligned (to src_width)
- * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param base_ptr base pointer, needs to be a i8 pointer type.
  * @param offsets vector with offsets
  * @param vector_justify select vector rather than integer justification
  */
@@ -251,41 +381,121 @@
 lp_build_gather(struct gallivm_state *gallivm,
                 unsigned length,
                 unsigned src_width,
-                unsigned dst_width,
+                struct lp_type dst_type,
                 boolean aligned,
                 LLVMValueRef base_ptr,
                 LLVMValueRef offsets,
                 boolean vector_justify)
 {
    LLVMValueRef res;
+   boolean need_expansion = src_width < dst_type.width * dst_type.length;
+   boolean vec_fetch;
+   struct lp_type fetch_type, fetch_dst_type;
+   LLVMTypeRef src_type;
+
+   assert(src_width <= dst_type.width * dst_type.length);
+
+   /*
+    * This is quite a mess...
+    * Figure out if the fetch should be done as:
+    * a) scalar or vector
+    * b) float or int
+    *
+    * As an example, for a 96bit fetch expanded into 4x32bit, it is better
+    * to use (3x32bit) vector type (then pad the vector). Otherwise, the
+    * zext will cause extra instructions.
+    * However, the same isn't true for 3x16bit (the codegen for that is
+    * completely worthless on x86 simd, and for 3x8bit is is way worse
+    * still, don't try that... (To get really good code out of llvm for
+    * these cases, the only way is to decompose the fetches manually
+    * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
+    * case requires sse41, otherwise simple scalar zext is way better.
+    * But probably not important enough, so don't bother.)
+    * Also, we try to honor the floating bit of destination (but isn't
+    * possible if caller asks for instance for 2x32bit dst_type with
+    * 48bit fetch - the idea would be to use 3x16bit fetch, pad and
+    * cast to 2x32f type, so the fetch is always int and on top of that
+    * we avoid the vec pad and use scalar zext due the above mentioned
+    * issue).
+    * Note this is optimized for x86 sse2 and up backend. Could be tweaked
+    * for other archs if necessary...
+    */
+   if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
+       (dst_type.length > 1)) {
+      /* use vector fetch (if dst_type is vector) */
+      vec_fetch = TRUE;
+      if (dst_type.floating) {
+         fetch_type = lp_type_float_vec(dst_type.width, src_width);
+      } else {
+         fetch_type = lp_type_int_vec(dst_type.width, src_width);
+      }
+      /* intentionally not using lp_build_vec_type here */
+      src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
+                                fetch_type.length);
+      fetch_dst_type = fetch_type;
+      fetch_dst_type.length = dst_type.length;
+    } else {
+      /* use scalar fetch */
+      vec_fetch = FALSE;
+      if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
+         fetch_type = lp_type_float(src_width);
+      } else {
+         fetch_type = lp_type_int(src_width);
+      }
+      src_type = lp_build_vec_type(gallivm, fetch_type);
+      fetch_dst_type = fetch_type;
+      fetch_dst_type.width = dst_type.width * dst_type.length;
+   }
 
    if (length == 1) {
       /* Scalar */
-      return lp_build_gather_elem(gallivm, length,
-                                  src_width, dst_width, aligned,
-                                  base_ptr, offsets, 0, vector_justify);
-   } else if (util_cpu_caps.has_avx2 && src_width == 32 && (length == 4 || length == 8)) {
-      return lp_build_gather_avx2(gallivm, length, src_width, dst_width, base_ptr, offsets);
+      res = lp_build_gather_elem_vec(gallivm, length,
+                                     src_width, src_type, fetch_dst_type,
+                                     aligned, base_ptr, offsets, 0,
+                                     vector_justify);
+      return LLVMBuildBitCast(gallivm->builder, res,
+                              lp_build_vec_type(gallivm, dst_type), "");
+      /*
+       * Excluding expansion from these paths because if you need it for
+       * 32bit/64bit fetches you're doing it wrong (this is gather, not
+       * conversion) and it would be awkward for floats.
+       */
+   } else if (util_cpu_caps.has_avx2 && !need_expansion &&
+              src_width == 32 && (length == 4 || length == 8)) {
+      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+                                  base_ptr, offsets);
+   /*
+    * This looks bad on paper wrt throughtput/latency on Haswell.
+    * Even on Broadwell it doesn't look stellar.
+    * Albeit no measurements were done (but tested to work).
+    * Should definitely enable on Skylake.
+    * (In general, should be more of a win if the fetch is 256bit wide -
+    * this is true for the 32bit case above too.)
+    */
+   } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
+              src_width == 64 && (length == 2 || length == 4)) {
+      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+                                  base_ptr, offsets);
    } else {
       /* Vector */
 
-      LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
-      LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
-      LLVMTypeRef gather_vec_type = dst_vec_type;
+      LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
       unsigned i;
       boolean vec_zext = FALSE;
-      unsigned gather_width = dst_width;
+      struct lp_type res_type, gather_res_type;
+      LLVMTypeRef res_t, gather_res_t;
 
+      res_type = fetch_dst_type;
+      res_type.length *= length;
+      gather_res_type = res_type;
 
-      if (src_width == 16 && dst_width == 32) {
-         LLVMTypeRef g_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width / 2);
-         gather_vec_type = LLVMVectorType(g_elem_type, length);
+      if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
          /*
           * Note that llvm is never able to optimize zext/insert combos
           * directly (i.e. zero the simd reg, then place the elements into
-          * the appropriate place directly). And 16->32bit zext simd loads
+          * the appropriate place directly). (I think this has to do with
+          * scalar/vector transition.) And scalar 16->32bit zext simd loads
           * aren't possible (instead loading to scalar reg first).
-          * (I think this has to do with scalar/vector transition.)
           * No idea about other archs...
           * We could do this manually, but instead we just use a vector
           * zext, which is simple enough (and, in fact, llvm might optimize
@@ -293,30 +503,53 @@
           * (We're not trying that with other bit widths as that might not be
           * easier, in particular with 8 bit values at least with only sse2.)
           */
+         assert(vec_fetch == FALSE);
+         gather_res_type.width /= 2;
+         fetch_dst_type = fetch_type;
+         src_type = lp_build_vec_type(gallivm, fetch_type);
          vec_zext = TRUE;
-         gather_width = 16;
       }
-      res = LLVMGetUndef(gather_vec_type);
+      res_t = lp_build_vec_type(gallivm, res_type);
+      gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
+      res = LLVMGetUndef(gather_res_t);
       for (i = 0; i < length; ++i) {
          LLVMValueRef index = lp_build_const_int32(gallivm, i);
-         LLVMValueRef elem;
-         elem = lp_build_gather_elem(gallivm, length,
-                                     src_width, gather_width, aligned,
-                                     base_ptr, offsets, i, vector_justify);
-         res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
+         elems[i] = lp_build_gather_elem_vec(gallivm, length,
+                                             src_width, src_type, fetch_dst_type,
+                                             aligned, base_ptr, offsets, i,
+                                             vector_justify);
+         if (!vec_fetch) {
+            res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
+         }
       }
       if (vec_zext) {
-         res = LLVMBuildZExt(gallivm->builder, res, dst_vec_type, "");
+         res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
          if (vector_justify) {
 #if PIPE_ARCH_BIG_ENDIAN
-            struct lp_type dst_type;
-            unsigned sv = dst_width - src_width;
-            dst_type = lp_type_uint_vec(dst_width, dst_width * length);
+            unsigned sv = dst_type.width - src_width;
             res = LLVMBuildShl(gallivm->builder, res,
-                               lp_build_const_int_vec(gallivm, dst_type, sv), "");
+                               lp_build_const_int_vec(gallivm, res_type, sv), "");
 #endif
          }
       }
+      if (vec_fetch) {
+         /*
+          * Do bitcast now otherwise llvm might get some funny ideas wrt
+          * float/int types...
+          */
+         for (i = 0; i < length; i++) {
+            elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
+                                        lp_build_vec_type(gallivm, dst_type), "");
+         }
+         res = lp_build_concat(gallivm, elems, dst_type, length);
+      } else {
+         struct lp_type really_final_type = dst_type;
+         assert(res_type.length * res_type.width ==
+                dst_type.length * dst_type.width * length);
+         really_final_type.length *= length;
+         res = LLVMBuildBitCast(gallivm->builder, res,
+                                lp_build_vec_type(gallivm, really_final_type), "");
+      }
    }
 
    return res;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
index 3ede476..7930864 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
@@ -55,7 +55,7 @@
 lp_build_gather(struct gallivm_state *gallivm,
                 unsigned length,
                 unsigned src_width,
-                unsigned dst_width,
+                struct lp_type dst_type,
                 boolean aligned,
                 LLVMValueRef base_ptr,
                 LLVMValueRef offsets,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index f91b761..c46749d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -579,10 +579,12 @@
    LLVMValueRef rgba8;
    struct lp_build_context u8n;
    LLVMTypeRef u8n_vec_type;
+   struct lp_type fetch_type;
 
    lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
    u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
 
+   fetch_type = lp_type_uint(bld->texel_type.width);
    if (util_format_is_rgba8_variant(bld->format_desc)) {
       /*
        * Given the format is a rgba8, just read the pixels as is,
@@ -591,7 +593,7 @@
       rgba8 = lp_build_gather(bld->gallivm,
                               bld->texel_type.length,
                               bld->format_desc->block.bits,
-                              bld->texel_type.width,
+                              fetch_type,
                               TRUE,
                               data_ptr, offset, TRUE);
 
@@ -925,14 +927,16 @@
             LLVMValueRef rgba8;
 
             if (util_format_is_rgba8_variant(bld->format_desc)) {
+               struct lp_type fetch_type;
                /*
                 * Given the format is a rgba8, just read the pixels as is,
                 * without any swizzling. Swizzling will be done later.
                 */
+               fetch_type = lp_type_uint(bld->texel_type.width);
                rgba8 = lp_build_gather(bld->gallivm,
                                        bld->texel_type.length,
                                        bld->format_desc->block.bits,
-                                       bld->texel_type.width,
+                                       fetch_type,
                                        TRUE,
                                        data_ptr, offset[k][j][i], TRUE);