/**************************************************************************
*
* Copyright 2010 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
**************************************************************************/
#include "util/u_debug.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "lp_bld_debug.h"
#include "lp_bld_const.h"
#include "lp_bld_format.h"
#include "lp_bld_gather.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_pack.h"
/**
* Get the pointer to one element from scatter positions in memory.
*
* @sa lp_build_gather()
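*
* A rough sketch of the IR this builds when offsets is a vector (the
* %offsets/%base_ptr value names here are purely illustrative):
*
*    %off = extractelement <N x i32> %offsets, i32 i
*    %ptr = getelementptr i8, i8* %base_ptr, i32 %off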
*/
LLVMValueRef
lp_build_gather_elem_ptr(struct gallivm_state *gallivm,
unsigned length,
LLVMValueRef base_ptr,
LLVMValueRef offsets,
unsigned i)
{
LLVMValueRef offset;
LLVMValueRef ptr;
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
if (length == 1) {
assert(i == 0);
offset = offsets;
} else {
LLVMValueRef index = lp_build_const_int32(gallivm, i);
offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, "");
}
ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
return ptr;
}
/**
* Gather one element from scatter positions in memory.
*
* @sa lp_build_gather()
*/
LLVMValueRef
lp_build_gather_elem(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
unsigned dst_width,
boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offsets,
unsigned i,
boolean vector_justify)
{
LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
LLVMValueRef ptr;
LLVMValueRef res;
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
res = LLVMBuildLoad(gallivm->builder, ptr, "");
/* XXX
* On some archs we probably really want to avoid having to deal
* with alignments lower than 4 bytes (if fetch size is a power of
* two >= 32). On x86 it doesn't matter, however.
* We should be able to guarantee full alignment for any kind of texture
* fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
* (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
* but I don't think that's quite what we wanted).
* For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
* looks like a good fit, but it seems this cap bit (and OpenGL) aren't
* enforcing what we want (which is what d3d10 does, the offset needs to
* be aligned to element size, but GL has bytes regardless of element
* size which would only leave us with minimum alignment restriction of 16
* which doesn't make much sense if the type isn't 4x32bit). Due to
* translation of offsets to first_elem in sampler_views it actually seems
* gallium could not do anything else except 16 no matter what...
*/
if (!aligned) {
LLVMSetAlignment(res, 1);
} else if (!util_is_power_of_two(src_width)) {
/*
* Full alignment is impossible, assume the caller really meant
* the individual elements were aligned (e.g. 3x32bit format).
* And yes the generated code may otherwise crash, llvm will
* really assume 128bit alignment with a 96bit fetch (I suppose
* that makes sense as it can just assume the upper 32bit to be
* whatever).
* Maybe the caller should be able to explicitly set this, but
* this should cover all the 3-channel formats.
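* For example, a 96-bit (3x32bit) fetch gives 96/24 = 4, i.e. 4-byte
* alignment below; a 48-bit (3x16bit) fetch gives 2; a 24-bit (3x8bit)
* fetch ends up with an alignment of 1 either way.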
*/
if (((src_width / 24) * 24 == src_width) &&
util_is_power_of_two(src_width / 24)) {
LLVMSetAlignment(res, src_width / 24);
} else {
LLVMSetAlignment(res, 1);
}
}
assert(src_width <= dst_width);
if (src_width < dst_width) {
res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
if (vector_justify) {
#ifdef PIPE_ARCH_BIG_ENDIAN
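/*
* Shift the loaded bits into the most significant end, so that a later
* bitcast of the result to a vector type puts channel X into vector
* element 0 (e.g. an 8-bit value zero-extended to 32 bits is shifted
* left by 24 here).
*/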
res = LLVMBuildShl(gallivm->builder, res,
LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
#endif
}
}
return res;
}
/**
* Gather one element from scatter positions in memory.
* Nearly the same as above, however the individual elements
* may be vectors themselves, and the fetch may use a float type.
* Can also pad the result vector instead of zero-extending it.
*
* @sa lp_build_gather()
*/
static LLVMValueRef
lp_build_gather_elem_vec(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
LLVMTypeRef src_type,
struct lp_type dst_type,
boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offsets,
unsigned i,
boolean vector_justify)
{
LLVMValueRef ptr, res;
LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
res = LLVMBuildLoad(gallivm->builder, ptr, "");
/* XXX
* On some archs we probably really want to avoid having to deal
* with alignments lower than 4 bytes (if fetch size is a power of
* two >= 32). On x86 it doesn't matter, however.
* We should be able to guarantee full alignment for any kind of texture
* fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
* (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
* but I don't think that's quite what we wanted).
* For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
* looks like a good fit, but it seems this cap bit (and OpenGL) aren't
* enforcing what we want (which is what d3d10 does, the offset needs to
* be aligned to element size, but GL has bytes regardless of element
* size which would only leave us with minimum alignment restriction of 16
* which doesn't make much sense if the type isn't 4x32bit). Due to
* translation of offsets to first_elem in sampler_views it actually seems
* gallium could not do anything else except 16 no matter what...
*/
if (!aligned) {
LLVMSetAlignment(res, 1);
} else if (!util_is_power_of_two(src_width)) {
/*
* Full alignment is impossible, assume the caller really meant
* the individual elements were aligned (e.g. 3x32bit format).
* And yes the generated code may otherwise crash, llvm will
* really assume 128bit alignment with a 96bit fetch (I suppose
* that makes sense as it can just assume the upper 32bit to be
* whatever).
* Maybe the caller should be able to explicitly set this, but
* this should cover all the 3-channel formats.
*/
if (((src_width / 24) * 24 == src_width) &&
util_is_power_of_two(src_width / 24)) {
LLVMSetAlignment(res, src_width / 24);
} else {
LLVMSetAlignment(res, 1);
}
}
assert(src_width <= dst_type.width * dst_type.length);
if (src_width < dst_type.width * dst_type.length) {
if (dst_type.length > 1) {
res = lp_build_pad_vector(gallivm, res, dst_type.length);
/*
* vector_justify is hopefully a non-issue since we only deal
* with src_width >= 32 here?
*/
} else {
LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);
/*
* Only valid if src_ptr_type is int type...
*/
res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
if (vector_justify) {
#ifdef PIPE_ARCH_BIG_ENDIAN
res = LLVMBuildShl(gallivm->builder, res,
LLVMConstInt(dst_elem_type,
dst_type.width - src_width, 0), "");
#endif
}
}
}
return res;
}
static LLVMValueRef
lp_build_gather_avx2(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
struct lp_type dst_type,
LLVMValueRef base_ptr,
LLVMValueRef offsets)
{
LLVMBuilderRef builder = gallivm->builder;
LLVMTypeRef src_type, src_vec_type;
LLVMValueRef res;
struct lp_type res_type = dst_type;
res_type.length *= length;
if (dst_type.floating) {
src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
LLVMFloatTypeInContext(gallivm->context);
} else {
src_type = LLVMIntTypeInContext(gallivm->context, src_width);
}
src_vec_type = LLVMVectorType(src_type, length);
/* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
if (0) {
/*
* XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
* will not use the AVX2 gather intrinsics (even with llvm 4.0), at
* least with Haswell. See
* http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
* And the generated code doing the emulation is quite a bit worse
* than what we get by doing it ourselves too.
*/
LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);
LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);
LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
LLVMValueRef src_ptr;
base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");
/* Rescale offsets from bytes to elements */
LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);
scale = lp_build_broadcast(gallivm, i32_vec_type, scale);
assert(LLVMTypeOf(offsets) == i32_vec_type);
offsets = LLVMBuildSDiv(builder, offsets, scale, "");
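/* e.g. with 32-bit elements, a byte offset of 12 becomes element index 3 */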
src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
char intrinsic[64];
util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
length, dst_type.floating ? "f" : "i", src_width);
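/* e.g. this yields "llvm.masked.gather.v4f32" for a 4-wide float gather */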
LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };
res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
} else {
LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
const char *intrinsic = NULL;
unsigned l_idx = 0;
assert(src_width == 32 || src_width == 64);
if (src_width == 32) {
assert(length == 4 || length == 8);
} else {
assert(length == 2 || length == 4);
}
static const char *intrinsics[2][2][2] = {
{{"llvm.x86.avx2.gather.d.d",
"llvm.x86.avx2.gather.d.d.256"},
{"llvm.x86.avx2.gather.d.q",
"llvm.x86.avx2.gather.d.q.256"}},
{{"llvm.x86.avx2.gather.d.ps",
"llvm.x86.avx2.gather.d.ps.256"},
{"llvm.x86.avx2.gather.d.pd",
"llvm.x86.avx2.gather.d.pd.256"}},
};
if ((src_width == 32 && length == 8) ||
(src_width == 64 && length == 4)) {
l_idx = 1;
}
intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
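/*
* For instance, a float gather with src_width == 32 and length == 8 picks
* llvm.x86.avx2.gather.d.ps.256, so the call built below is roughly
* (a sketch, not the exact textual IR):
*
*    %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(
*              <8 x float> undef, i8* %base_ptr, <8 x i32> %offsets,
*              <8 x float> %mask, i8 1)
*/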
LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
mask = LLVMConstBitCast(mask, src_vec_type);
LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);
LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };
res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
}
res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
return res;
}
/**
* Gather elements from scatter positions in memory into a single vector.
* Use for fetching texels from a texture.
* For SSE, typical values are length=4, src_width=32, dst_width=32.
*
* When src_width < dst_width, the return value can be justified in
* one of two ways:
* "integer justification" is used when the caller treats the destination
* as a packed integer bitmask, as described by the channels' "shift" and
* "width" fields;
* "vector justification" is used when the caller casts the destination
* to a vector and needs channel X to be in vector element 0.
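*
* For example, when gathering an 8-bit texel into a 32-bit destination
* element, integer justification leaves the texel in the low 8 bits of
* the zero-extended result, whereas vector justification (on a big-endian
* target) shifts it into the top 8 bits, so that a bitcast of the result
* to a vector of bytes yields the texel in element 0.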
*
* @param length length of the offsets
* @param src_width src element width in bits
* @param dst_type result element type (src will be expanded to fit,
* but truncation is not allowed)
* (this may be a vector, must be power-of-two sized)
* @param aligned whether the data is guaranteed to be aligned (to src_width)
* @param base_ptr base pointer, needs to be an i8 pointer type.
* @param offsets vector with offsets
* @param vector_justify select vector rather than integer justification
*/
LLVMValueRef
lp_build_gather(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
struct lp_type dst_type,
boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offsets,
boolean vector_justify)
{
LLVMValueRef res;
boolean need_expansion = src_width < dst_type.width * dst_type.length;
boolean vec_fetch;
struct lp_type fetch_type, fetch_dst_type;
LLVMTypeRef src_type;
assert(src_width <= dst_type.width * dst_type.length);
/*
* This is quite a mess...
* Figure out if the fetch should be done as:
* a) scalar or vector
* b) float or int
*
* As an example, for a 96bit fetch expanded into 4x32bit, it is better
* to use (3x32bit) vector type (then pad the vector). Otherwise, the
* zext will cause extra instructions.
* However, the same isn't true for 3x16bit (the codegen for that is
* completely worthless on x86 simd, and for 3x8bit it is way worse
* still, don't try that...) (To get really good code out of llvm for
* these cases, the only way is to decompose the fetches manually
* into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
* case requires sse41, otherwise simple scalar zext is way better.
* But probably not important enough, so don't bother.)
* Also, we try to honor the floating bit of destination (but that isn't
* possible if the caller asks, for instance, for 2x32bit dst_type with
* 48bit fetch - the idea would be to use 3x16bit fetch, pad and
* cast to 2x32f type, so the fetch is always int and on top of that
* we avoid the vec pad and use scalar zext due to the above mentioned
* issue).
* Note this is optimized for x86 sse2 and up backend. Could be tweaked
* for other archs if necessary...
*/
if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
(dst_type.length > 1)) {
/* use vector fetch (if dst_type is vector) */
vec_fetch = TRUE;
if (dst_type.floating) {
fetch_type = lp_type_float_vec(dst_type.width, src_width);
} else {
fetch_type = lp_type_int_vec(dst_type.width, src_width);
}
/* intentionally not using lp_build_vec_type here */
src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
fetch_type.length);
fetch_dst_type = fetch_type;
fetch_dst_type.length = dst_type.length;
} else {
/* use scalar fetch */
vec_fetch = FALSE;
if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
fetch_type = lp_type_float(src_width);
} else {
fetch_type = lp_type_int(src_width);
}
src_type = lp_build_vec_type(gallivm, fetch_type);
fetch_dst_type = fetch_type;
fetch_dst_type.width = dst_type.width * dst_type.length;
}
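/*
* At this point fetch_dst_type describes what a single gathered element
* should look like before the results are combined. For example (sketch
* of the two paths above): a 96-bit fetch into a 4x32bit dst_type gives
* a 4x32bit vector (a 3x32bit load padded by one element), whereas a
* 16-bit fetch into the same dst_type gives a single 128-bit integer
* (the 16-bit load gets zero-extended).
*/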
if (length == 1) {
/* Scalar */
res = lp_build_gather_elem_vec(gallivm, length,
src_width, src_type, fetch_dst_type,
aligned, base_ptr, offsets, 0,
vector_justify);
return LLVMBuildBitCast(gallivm->builder, res,
lp_build_vec_type(gallivm, dst_type), "");
/*
* Excluding expansion from these paths because if you need it for
* 32bit/64bit fetches you're doing it wrong (this is gather, not
* conversion) and it would be awkward for floats.
*/
} else if (util_cpu_caps.has_avx2 && !need_expansion &&
src_width == 32 && (length == 4 || length == 8)) {
return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
base_ptr, offsets);
/*
* This looks bad on paper wrt throughput/latency on Haswell.
* Even on Broadwell it doesn't look stellar.
* (No measurements were done, but it was tested to work.)
* Should definitely enable on Skylake.
* (In general, should be more of a win if the fetch is 256bit wide -
* this is true for the 32bit case above too.)
*/
} else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
src_width == 64 && (length == 2 || length == 4)) {
return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
base_ptr, offsets);
} else {
/* Vector */
LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
unsigned i;
boolean vec_zext = FALSE;
struct lp_type res_type, gather_res_type;
LLVMTypeRef res_t, gather_res_t;
res_type = fetch_dst_type;
res_type.length *= length;
gather_res_type = res_type;
if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
/*
* Note that llvm is never able to optimize zext/insert combos
* directly (i.e. zero the simd reg, then place the elements into
* the appropriate place directly). (I think this has to do with
* scalar/vector transition.) And scalar 16->32bit zext simd loads
* aren't possible (instead loading to scalar reg first).
* No idea about other archs...
* We could do this manually, but instead we just use a vector
* zext, which is simple enough (and, in fact, llvm might optimize
* this away).
* (We're not trying that with other bit widths as that might not be
* easier, in particular with 8 bit values at least with only sse2.)
*/
assert(vec_fetch == FALSE);
gather_res_type.width /= 2;
fetch_dst_type = fetch_type;
src_type = lp_build_vec_type(gallivm, fetch_type);
vec_zext = TRUE;
}
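/*
* E.g. for length == 4 the loop below now gathers into a <4 x i16>
* vector, and the single vector zext afterwards widens it to <4 x i32>
* in one go.
*/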
res_t = lp_build_vec_type(gallivm, res_type);
gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
res = LLVMGetUndef(gather_res_t);
for (i = 0; i < length; ++i) {
LLVMValueRef index = lp_build_const_int32(gallivm, i);
elems[i] = lp_build_gather_elem_vec(gallivm, length,
src_width, src_type, fetch_dst_type,
aligned, base_ptr, offsets, i,
vector_justify);
if (!vec_fetch) {
res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
}
}
if (vec_zext) {
res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
if (vector_justify) {
#ifdef PIPE_ARCH_BIG_ENDIAN
unsigned sv = dst_type.width - src_width;
res = LLVMBuildShl(gallivm->builder, res,
lp_build_const_int_vec(gallivm, res_type, sv), "");
#endif
}
}
if (vec_fetch) {
/*
* Do bitcast now otherwise llvm might get some funny ideas wrt
* float/int types...
*/
for (i = 0; i < length; i++) {
elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
lp_build_vec_type(gallivm, dst_type), "");
}
res = lp_build_concat(gallivm, elems, dst_type, length);
} else {
struct lp_type really_final_type = dst_type;
assert(res_type.length * res_type.width ==
dst_type.length * dst_type.width * length);
really_final_type.length *= length;
res = LLVMBuildBitCast(gallivm->builder, res,
lp_build_vec_type(gallivm, really_final_type), "");
}
}
return res;
}
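/**
* Build an LLVM vector from an array of (scalar) LLVM values.
*
* All values are expected to have the same type.
*/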
LLVMValueRef
lp_build_gather_values(struct gallivm_state * gallivm,
LLVMValueRef * values,
unsigned value_count)
{
LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef vec = LLVMGetUndef(vec_type);
unsigned i;
for (i = 0; i < value_count; i++) {
LLVMValueRef index = lp_build_const_int32(gallivm, i);
vec = LLVMBuildInsertElement(builder, vec, values[i], index, "");
}
return vec;
}