src/gallium/drivers/llvmpipe/lp_bld_interp.c - platform/external/mesa3d - Git at Google

 /**************************************************************************
  *
  * Copyright 2009 VMware, Inc.
  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
  * without limitation the rights to use, copy, modify, merge, publish,
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  **************************************************************************/

 /**
  * @file
  * Position and shader input interpolation.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */

 #include "pipe/p_shader_tokens.h"
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "tgsi/tgsi_scan.h"
 #include "gallivm/lp_bld_debug.h"
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_swizzle.h"
 #include "gallivm/lp_bld_flow.h"
 #include "lp_bld_interp.h"


 /*
  * The shader JIT function operates on blocks of quads.
  * Each block has 2x2 quads and each quad has 2x2 pixels.
  *
  * We iterate over the quads in order 0, 1, 2, 3:
  *
  * #################
  * #   |   #   |   #
  * #---0---#---1---#
  * #   |   #   |   #
  * #################
  * #   |   #   |   #
  * #---2---#---3---#
  * #   |   #   |   #
  * #################
  *
  * If we iterate over multiple quads at once, quads 01 and 23 are processed
  * together.
  *
  * Within each quad, we have four pixels which are represented in SOA
  * order:
  *
  * #########
  * # 0 | 1 #
  * #---+---#
  * # 2 | 3 #
  * #########
  *
  * So the green channel (for example) of the four pixels is stored in
  * a single vector register: {g0, g1, g2, g3}.
  * The order stays the same even with multiple quads:
  * 0 1 4 5
  * 2 3 6 7
  * is stored as g0..g7
  */


 /**
  * Do one perspective divide per quad.
  *
  * For perspective interpolation, the final attribute value is given by
  *
  *  a' = a/w = a * oow
  *
  * where
  *
  *  a = a0 + dadx*x + dady*y
  *  w = w0 + dwdx*x + dwdy*y
  *  oow = 1/w = 1/(w0 + dwdx*x + dwdy*y)
  *
  * Instead of computing the division per pixel, with this macro we compute the
  * division on the upper left pixel of each quad, and use a linear
  * approximation in the remaining pixels, given by:
  *
  *  da'dx = (dadx - dwdx*a')*oow
  *  da'dy = (dady - dwdy*a')*oow
  *
  * Ironically, this actually makes things slower -- probably because the
  * divide hardware unit is rarely used, whereas the multiply unit is typically
  * already saturated.
  */
 #define PERSPECTIVE_DIVIDE_PER_QUAD 0


 /*
  * Pixel offsets, relative to the upper-left pixel of the block, indexed by
  * vector element.  Each row below holds the four pixels of one 2x2 quad,
  * listed in the quad order 0, 1, 2, 3 described at the top of this file.
  */
 static const unsigned char quad_offset_x[16] = {
    0, 1, 0, 1,   /* quad 0 */
    2, 3, 2, 3,   /* quad 1 */
    0, 1, 0, 1,   /* quad 2 */
    2, 3, 2, 3    /* quad 3 */
 };
 static const unsigned char quad_offset_y[16] = {
    0, 0, 1, 1,   /* quad 0 */
    0, 0, 1, 1,   /* quad 1 */
    2, 2, 3, 3,   /* quad 2 */
    2, 2, 3, 3    /* quad 3 */
 };


 /**
  * Attach a readable name to an LLVM value that belongs to an attribute.
  *
  * Slot 0 is named "pos.<chan><suffix>"; any other slot N is named
  * "input<N-1>.<chan><suffix>", where <chan> is one of x, y, z, w.
  *
  * @param val     value to name
  * @param attrib  attribute slot (0 is the position)
  * @param chan    channel index used to pick the letter out of "xyzw"
  * @param suffix  text appended after the channel letter
  */
 static void
 attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
 {
    const char letter = "xyzw"[chan];

    if (attrib != 0) {
       lp_build_name(val, "input%u.%c%s", attrib - 1, letter, suffix);
       return;
    }

    lp_build_name(val, "pos.%c%s", letter, suffix);
 }

 /**
  * Build the vectors of per-pixel x and y offsets for the quad(s) that
  * start at quad_start_index.
  *
  * Element i of the result is quad_offset_x[i] / quad_offset_y[i] shifted
  * by the position of the starting quad: bit 0 of quad_start_index moves
  * two pixels to the right, bit 1 moves two pixels down.
  *
  * @param coeff_bld         build context giving the vector length, the
  *                          undef start value and the builder
  * @param quad_start_index  index of the first quad covered by the vector
  * @param pixoffx           receives the vector of x offsets
  * @param pixoffy           receives the vector of y offsets
  */
 static void
 calc_offsets(struct lp_build_context *coeff_bld,
              unsigned quad_start_index,
              LLVMValueRef *pixoffx,
              LLVMValueRef *pixoffy)
 {
    struct gallivm_state *gallivm = coeff_bld->gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    const unsigned num_pix = coeff_bld->type.length;
    const unsigned base_x = (quad_start_index & 1) * 2;
    const unsigned base_y = quad_start_index & 2;
    LLVMValueRef xvec = coeff_bld->undef;
    LLVMValueRef yvec = coeff_bld->undef;
    unsigned pix;

    /* Fill the vectors one lane at a time with constant offsets. */
    for (pix = 0; pix < num_pix; ++pix) {
       LLVMValueRef lane = lp_build_const_int32(gallivm, pix);
       LLVMValueRef xoff = lp_build_const_float(gallivm,
                                                quad_offset_x[pix] + base_x);
       LLVMValueRef yoff = lp_build_const_float(gallivm,
                                                quad_offset_y[pix] + base_y);

       xvec = LLVMBuildInsertElement(builder, xvec, xoff, lane, "");
       yvec = LLVMBuildInsertElement(builder, yvec, yoff, lane, "");
    }

    *pixoffx = xvec;
    *pixoffy = yvec;
 }


 /**
  * Fetch the interpolation coefficients for the "simple" interpolation path.
  *
  * For every attribute slot the a0, dadx and dady coefficients are loaded as
  * one 4-channel (AoS) vector each and kept in bld->a0aos, bld->dadxaos and
  * bld->dadyaos; coefficients that an interpolation mode does not use stay
  * zero.  Slots using LP_INTERP_POSITION are skipped entirely.
  *
  * Design notes from the original implementation: this scheme is much
  * easier and needs significantly fewer instructions in the per-stamp part
  * (less than half), but more instructions overall, so it is a loss if most
  * quads are active.  It might still be a win with larger vectors.  There is
  * no ability to do a per-quad divide (doable but not implemented).  It
  * could be made to work with passed in pixel offsets (i.e. active quad
  * merging).
  *
  * @param bld       interpolation context to fill in
  * @param a0_ptr    pointer to the a0 coefficients, 4 floats per slot
  * @param dadx_ptr  pointer to the dadx coefficients, 4 floats per slot
  * @param dady_ptr  pointer to the dady coefficients, 4 floats per slot
  */
 static void
 coeffs_init_simple(struct lp_build_interp_soa_context *bld,
                    LLVMValueRef a0_ptr,
                    LLVMValueRef dadx_ptr,
                    LLVMValueRef dady_ptr)
 {
    struct lp_build_context *setup_bld = &bld->setup_bld;
    struct gallivm_state *gallivm = bld->coeff_bld.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    unsigned slot;

    for (slot = 0; slot < bld->num_attribs; slot++) {
       /*
        * All 4 channels are always fetched, for performance/simplicity.
        * The fetch is done up front because it appeared to generate better
        * code: more moves initially but fewer later.  According to the
        * original author this looks like an llvm issue -- when it runs out
        * of registers it spills/reloads the values instead of simply
        * reloading them from the passed in pointers; some optimization
        * passes might help.  Worth investigating again later.
        */
       const unsigned mode = bld->interp[slot];
       const boolean has_gradients = mode == LP_INTERP_PERSPECTIVE ||
                                     mode == LP_INTERP_LINEAR;
       const boolean has_a0 = has_gradients ||
                              mode == LP_INTERP_CONSTANT ||
                              mode == LP_INTERP_FACING;
       LLVMValueRef first_chan = lp_build_const_int32(gallivm,
                                    slot * TGSI_NUM_CHANNELS);
       LLVMValueRef a0_vec = setup_bld->zero;
       LLVMValueRef dadx_vec = setup_bld->zero;
       LLVMValueRef dady_vec = setup_bld->zero;
       LLVMValueRef addr;

       if (mode == LP_INTERP_POSITION) {
          /* Nothing to do as the position coeffs are already setup in slot 0 */
          continue;
       }

       if (has_gradients) {
          addr = LLVMBuildGEP(builder, dadx_ptr, &first_chan, 1, "");
          addr = LLVMBuildBitCast(builder, addr,
                                  LLVMPointerType(setup_bld->vec_type, 0), "");
          dadx_vec = LLVMBuildLoad(builder, addr, "");

          addr = LLVMBuildGEP(builder, dady_ptr, &first_chan, 1, "");
          addr = LLVMBuildBitCast(builder, addr,
                                  LLVMPointerType(setup_bld->vec_type, 0), "");
          dady_vec = LLVMBuildLoad(builder, addr, "");

          attrib_name(dadx_vec, slot, 0, ".dadxaos");
          attrib_name(dady_vec, slot, 0, ".dadyaos");
       }

       if (has_a0) {
          addr = LLVMBuildGEP(builder, a0_ptr, &first_chan, 1, "");
          addr = LLVMBuildBitCast(builder, addr,
                                  LLVMPointerType(setup_bld->vec_type, 0), "");
          a0_vec = LLVMBuildLoad(builder, addr, "");
          attrib_name(a0_vec, slot, 0, ".a0aos");
       }
       else {
          /* unknown interpolation mode */
          assert(0);
       }

       bld->a0aos[slot] = a0_vec;
       bld->dadxaos[slot] = dadx_vec;
       bld->dadyaos[slot] = dady_vec;
    }
 }

 /**
  * Interpolate the shader input attribute values.
  * This is called for each (group of) quad(s).
  *
  * Every enabled channel of every attribute slot in [start, end) is
  * evaluated from the AoS coefficients kept in bld->a0aos, bld->dadxaos and
  * bld->dadyaos, and the result is written to bld->attribs[attrib][chan].
  *
  * @param bld               interpolation context
  * @param gallivm           gallivm state whose builder receives the code
  * @param quad_start_index  compile-time index of the first quad; only used
  *                          to build the pixel offsets when
  *                          bld->dynamic_offsets is not set
  * @param loop_iter         index into bld->xoffset_store/yoffset_store;
  *                          must be non-NULL when bld->dynamic_offsets is set
  * @param start             first attribute slot to update (0 is position)
  * @param end               one past the last attribute slot to update
  */
 static void
 attribs_update_simple(struct lp_build_interp_soa_context *bld,
                       struct gallivm_state *gallivm,
                       int quad_start_index,
                       LLVMValueRef loop_iter,
                       int start,
                       int end)
 {
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    struct lp_build_context *setup_bld = &bld->setup_bld;
    /* 1/w, built lazily by the first perspective channel and then reused */
    LLVMValueRef oow = NULL;
    unsigned attrib;
    LLVMValueRef pixoffx;
    LLVMValueRef pixoffy;

    /* could do this with code-generated passed in pixel offsets too */
    if (bld->dynamic_offsets) {
       LLVMValueRef ptr;

       /* fetch the offset vectors that were stored at init time */
       assert(loop_iter);
       ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, "");
       pixoffx = LLVMBuildLoad(builder, ptr, "");
       ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, "");
       pixoffy = LLVMBuildLoad(builder, ptr, "");
    }
    else {
       calc_offsets(coeff_bld, quad_start_index, &pixoffx, &pixoffy);
    }

    /* add the block origin (bld->x, bld->y) to the per-pixel offsets */
    pixoffx = LLVMBuildFAdd(builder, pixoffx,
                            lp_build_broadcast_scalar(coeff_bld, bld->x), "");
    pixoffy = LLVMBuildFAdd(builder, pixoffy,
                            lp_build_broadcast_scalar(coeff_bld, bld->y), "");

    for (attrib = start; attrib < end; attrib++) {
       const unsigned mask = bld->mask[attrib];
       const unsigned interp = bld->interp[attrib];
       unsigned chan;

       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
          /* channels that are not in the usage mask are left untouched */
          if (mask & (1 << chan)) {
             LLVMValueRef index;
             LLVMValueRef dadx = coeff_bld->zero;
             LLVMValueRef dady = coeff_bld->zero;
             LLVMValueRef a = coeff_bld->zero;

             index = lp_build_const_int32(gallivm, chan);
             switch (interp) {
             case LP_INTERP_PERSPECTIVE:
                /* fall-through */

             case LP_INTERP_LINEAR:
                if (attrib == 0 && chan == 0) {
                   /* position x: a0 = 0, dadx = 1, dady = 0 */
                   dadx = coeff_bld->one;
                }
                else if (attrib == 0 && chan == 1) {
                   /* position y: a0 = 0, dadx = 0, dady = 1 */
                   dady = coeff_bld->one;
                }
                else {
                   /* broadcast this channel's coefficients to all pixels */
                   dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                                     coeff_bld->type, bld->dadxaos[attrib],
                                                     index);
                   dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                                     coeff_bld->type, bld->dadyaos[attrib],
                                                     index);
                   a = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                                  coeff_bld->type, bld->a0aos[attrib],
                                                  index);
                }
                /*
                 * a = a0 + (x * dadx + y * dady)
                 */
                dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
                dady = LLVMBuildFMul(builder, dady, pixoffy, "");
                a = LLVMBuildFAdd(builder, a, dadx, "");
                a = LLVMBuildFAdd(builder, a, dady, "");

                if (interp == LP_INTERP_PERSPECTIVE) {
                   if (oow == NULL) {
                      /*
                       * Reads position w from bld->attribs[0][3], so it uses
                       * whatever value is stored there at this point.
                       */
                      LLVMValueRef w = bld->attribs[0][3];
                      assert(attrib != 0);
                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
                      oow = lp_build_rcp(coeff_bld, w);
                   }
                   a = lp_build_mul(coeff_bld, a, oow);
                }
                break;

             case LP_INTERP_CONSTANT:
             case LP_INTERP_FACING:
                /* no derivatives: the value is a0 for every pixel */
                a = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                               coeff_bld->type, bld->a0aos[attrib],
                                               index);
                break;

             case LP_INTERP_POSITION:
                /* reuse the value currently stored for the position slot */
                assert(attrib > 0);
                a = bld->attribs[0][chan];
                break;

             default:
                assert(0);
                break;
             }

             if ((attrib == 0) && (chan == 2)){
                /* FIXME: Depth values can exceed 1.0, due to the fact that
                 * setup interpolation coefficients refer to (0,0) which causes
                 * precision loss. So we must clamp to 1.0 here to avoid artifacts
                 */
                a = lp_build_min(coeff_bld, a, coeff_bld->one);
             }
             bld->attribs[attrib][chan] = a;
          }
       }
    }
 }

 /**
  * Initialize the bld->a, dadq fields.  This involves fetching
  * those values from the arrays which are passed into the JIT function.
  *
  * For every enabled channel of every attribute (except LP_INTERP_POSITION
  * slots, which are skipped) this stores:
  *  - bld->dadq[attrib][chan]: dadx * pixel x offset + dady * pixel y offset
  *    for each vector element
  *  - bld->a[attrib][chan]: the attribute value at the block origin plus
  *    twice dadq (for constant/facing attributes just a0).  When
  *    bld->dynamic_offsets is set this is a pointer to a stack slot holding
  *    the vector instead of the vector itself.
  *
  * @param bld       interpolation context; bld->x and bld->y must already
  *                  be set
  * @param a0_ptr    pointer to the a0 coefficients, 4 floats per slot
  * @param dadx_ptr  pointer to the dadx coefficients, 4 floats per slot
  * @param dady_ptr  pointer to the dady coefficients, 4 floats per slot
  */
 static void
 coeffs_init(struct lp_build_interp_soa_context *bld,
             LLVMValueRef a0_ptr,
             LLVMValueRef dadx_ptr,
             LLVMValueRef dady_ptr)
 {
    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    struct lp_build_context *setup_bld = &bld->setup_bld;
    struct gallivm_state *gallivm = coeff_bld->gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef pixoffx, pixoffy;
    unsigned attrib;
    unsigned chan;
    unsigned i;

    /* constant vectors with the x/y offset of every pixel in the vector */
    pixoffx = coeff_bld->undef;
    pixoffy = coeff_bld->undef;
    for (i = 0; i < coeff_bld->type.length; i++) {
       LLVMValueRef nr = lp_build_const_int32(gallivm, i);
       LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
       LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
       pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
       pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
    }


    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
       const unsigned mask = bld->mask[attrib];
       const unsigned interp = bld->interp[attrib];
       /* each attribute slot occupies TGSI_NUM_CHANNELS floats */
       LLVMValueRef index = lp_build_const_int32(gallivm,
                                 attrib * TGSI_NUM_CHANNELS);
       LLVMValueRef ptr;
       LLVMValueRef dadxaos = setup_bld->zero;
       LLVMValueRef dadyaos = setup_bld->zero;
       LLVMValueRef a0aos = setup_bld->zero;

       /* always fetch all 4 values for performance/simplicity */
       switch (interp) {
       case LP_INTERP_PERSPECTIVE:
          /* fall-through */

       case LP_INTERP_LINEAR:
          ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
          ptr = LLVMBuildBitCast(builder, ptr,
                LLVMPointerType(setup_bld->vec_type, 0), "");
          dadxaos = LLVMBuildLoad(builder, ptr, "");

          ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
          ptr = LLVMBuildBitCast(builder, ptr,
                LLVMPointerType(setup_bld->vec_type, 0), "");
          dadyaos = LLVMBuildLoad(builder, ptr, "");

          attrib_name(dadxaos, attrib, 0, ".dadxaos");
          attrib_name(dadyaos, attrib, 0, ".dadyaos");
          /* fall-through */

       case LP_INTERP_CONSTANT:
       case LP_INTERP_FACING:
          ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
          ptr = LLVMBuildBitCast(builder, ptr,
                LLVMPointerType(setup_bld->vec_type, 0), "");
          a0aos = LLVMBuildLoad(builder, ptr, "");
          attrib_name(a0aos, attrib, 0, ".a0aos");
          break;

       case LP_INTERP_POSITION:
          /* Nothing to do as the position coeffs are already setup in slot 0 */
          continue;

       default:
          assert(0);
          break;
       }

       /*
        * a = a0 + (x * dadx + y * dady)
        * a0aos is the attrib value at top left corner of stamp
        */
       if (interp != LP_INTERP_CONSTANT &&
           interp != LP_INTERP_FACING) {
          LLVMValueRef axaos, ayaos;
          axaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->x),
                                dadxaos, "");
          ayaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->y),
                                dadyaos, "");
          a0aos = LLVMBuildFAdd(builder, a0aos, ayaos, "");
          a0aos = LLVMBuildFAdd(builder, a0aos, axaos, "");
       }

       /*
        * dadq = {0, dadx, dady, dadx + dady}
        * for two quads (side by side) this is:
        * {0, dadx, dady, dadx+dady,
        *  2*dadx, 3*dadx, 2*dadx+dady, 3*dadx+dady}
        */
       for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
          /* this generates a large number of shuffles... */
          if (mask & (1 << chan)) {
             LLVMValueRef dadx, dady;
             LLVMValueRef dadq, dadq2;
             LLVMValueRef a;
             LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);

             if (attrib == 0 && chan == 0) {
                /* position x: starts at bld->x and grows by 1 per pixel in x */
                a = lp_build_broadcast_scalar(coeff_bld, bld->x);
                dadx = coeff_bld->one;
                dady = coeff_bld->zero;
             }
             else if (attrib == 0 && chan == 1) {
                /* position y: starts at bld->y and grows by 1 per pixel in y */
                a = lp_build_broadcast_scalar(coeff_bld, bld->y);
                dady = coeff_bld->one;
                dadx = coeff_bld->zero;
             }
             else {
                dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                               coeff_bld->type, dadxaos, chan_index);
                dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                               coeff_bld->type, dadyaos, chan_index);

                /*
                 * a = {a, a, a, a}
                 */
                a = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                               coeff_bld->type, a0aos, chan_index);
             }

             /* per-pixel delta relative to the first pixel of the vector */
             dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
             dady = LLVMBuildFMul(builder, dady, pixoffy, "");
             dadq = LLVMBuildFAdd(builder, dadx, dady, "");

             /*
              * Compute the attrib values on the upper-left corner of each
              * group of quads.
              * Note that if we process 2 quads at once this doesn't
              * do exactly what we want.
              * We need to access elem 0 and 2 respectively later if we process
              * 2 quads at once.
              */

             if (interp != LP_INTERP_CONSTANT &&
                 interp != LP_INTERP_FACING) {
                /* a += 2 * dadq, i.e. the in-vector pixel offsets doubled */
                dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
                a = LLVMBuildFAdd(builder, a, dadq2, "");
 	    }

 #if PERSPECTIVE_DIVIDE_PER_QUAD
             /*
              * a *= 1 / w
              */

             /*
              * XXX since we're only going to access elements 0,2 out of 8
              * if we have 8-wide vectors we should do the division only 4-wide.
              * a is really a 2-elements in a 4-wide vector disguised as 8-wide
              * in this case.
              */
             if (interp == LP_INTERP_PERSPECTIVE) {
                LLVMValueRef w = bld->a[0][3];
                assert(attrib != 0);
                assert(bld->mask[0] & TGSI_WRITEMASK_W);
                if (!bld->oow) {
                   bld->oow = lp_build_rcp(coeff_bld, w);
                   lp_build_name(bld->oow, "oow");
                }
                a = lp_build_mul(coeff_bld, a, bld->oow);
             }
 #endif

             attrib_name(a, attrib, chan, ".a");
             attrib_name(dadq, attrib, chan, ".dadq");

             if (bld->dynamic_offsets) {
                /*
                 * Keep the vector in a stack slot and store the pointer to
                 * it, so that it can be indexed with a value only known at
                 * run time.
                 */
                bld->a[attrib][chan] = lp_build_alloca(gallivm,
                                                       LLVMTypeOf(a), "");
                LLVMBuildStore(builder, a, bld->a[attrib][chan]);
             }
             else {
                bld->a[attrib][chan] = a;
             }
             bld->dadq[attrib][chan] = dadq;
          }
       }
    }
 }


 /**
  * Increment the shader input attribute values.
  * This is called when we move from one quad to the next.
  *
  * For every enabled channel of every attribute slot in [start, end) the
  * per-quad start value from bld->a is broadcast to all elements, the
  * per-pixel deltas from bld->dadq are added, and the result is written to
  * bld->attribs[attrib][chan].
  *
  * @param bld               interpolation context set up by coeffs_init()
  * @param gallivm           gallivm state whose builder receives the code
  * @param quad_start_index  compile-time quad index (must be < 4); selects
  *                          the element of bld->a to broadcast when
  *                          bld->dynamic_offsets is not set
  * @param loop_iter         element index used instead of quad_start_index
  *                          when bld->dynamic_offsets is set
  * @param start             first attribute slot to update (0 is position)
  * @param end               one past the last attribute slot to update
  */
 static void
 attribs_update(struct lp_build_interp_soa_context *bld,
                struct gallivm_state *gallivm,
                int quad_start_index,
                LLVMValueRef loop_iter,
                int start,
                int end)
 {
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    /*
     * Shuffle mask built from quad_start_index; used below to broadcast
     * that element of a vector (presumably every lane holds the same
     * index -- see lp_build_const_int_vec).
     */
    LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_start_index);
    /* 1/w, built lazily by the first perspective channel and then reused */
    LLVMValueRef oow = NULL;
    unsigned attrib;
    unsigned chan;

    assert(quad_start_index < 4);

    for(attrib = start; attrib < end; ++attrib) {
       const unsigned mask = bld->mask[attrib];
       const unsigned interp = bld->interp[attrib];
       for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
          /* channels that are not in the usage mask are left untouched */
          if(mask & (1 << chan)) {
             LLVMValueRef a;
             if (interp == LP_INTERP_CONSTANT ||
                 interp == LP_INTERP_FACING) {
                /* same value for every pixel: use the stored vector as is */
                a = bld->a[attrib][chan];
                if (bld->dynamic_offsets) {
                   /* bld->a holds a pointer to the vector in this mode */
                   a = LLVMBuildLoad(builder, a, "");
                }
             }
             else if (interp == LP_INTERP_POSITION) {
                /* reuse the value currently stored for the position slot */
                assert(attrib > 0);
                a = bld->attribs[0][chan];
             }
             else {
                LLVMValueRef dadq;

                a = bld->a[attrib][chan];

                /*
                 * Broadcast the attribute value for this quad into all elements
                 */

                if (bld->dynamic_offsets) {
                   /*
                    * bld->a holds a pointer to the stored vector: view it as
                    * an array of floats, load element loop_iter and
                    * broadcast that scalar.
                    */
                   LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext(
                                                             gallivm->context), 0);
                   LLVMValueRef ptr;
                   a = LLVMBuildBitCast(builder, a, ptr_type, "");
                   ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, "");
                   a = LLVMBuildLoad(builder, ptr, "");
                   a = lp_build_broadcast_scalar(&bld->coeff_bld, a);
                }
                else {
                   a = LLVMBuildShuffleVector(builder,
                                              a, coeff_bld->undef, shuffle, "");
                }

                /*
                 * Get the derivatives.
                 */

                dadq = bld->dadq[attrib][chan];

 #if PERSPECTIVE_DIVIDE_PER_QUAD
                if (interp == LP_INTERP_PERSPECTIVE) {
                   LLVMValueRef dwdq = bld->dadq[0][3];

                   if (oow == NULL) {
                      assert(bld->oow);
                      oow = LLVMBuildShuffleVector(coeff_bld->builder,
                                                   bld->oow, coeff_bld->undef,
                                                   shuffle, "");
                   }

                   dadq = lp_build_sub(coeff_bld,
                                       dadq,
                                       lp_build_mul(coeff_bld, a, dwdq));
                   dadq = lp_build_mul(coeff_bld, dadq, oow);
                }
 #endif

                /*
                 * Add the derivatives
                 */

                a = lp_build_add(coeff_bld, a, dadq);

 #if !PERSPECTIVE_DIVIDE_PER_QUAD
                if (interp == LP_INTERP_PERSPECTIVE) {
                   if (oow == NULL) {
                      /*
                       * Reads position w from bld->attribs[0][3], so it uses
                       * whatever value is stored there at this point.
                       */
                      LLVMValueRef w = bld->attribs[0][3];
                      assert(attrib != 0);
                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
                      oow = lp_build_rcp(coeff_bld, w);
                   }
                   a = lp_build_mul(coeff_bld, a, oow);
                }
 #endif

                if (attrib == 0 && chan == 2) {
                   /* FIXME: Depth values can exceed 1.0, due to the fact that
                    * setup interpolation coefficients refer to (0,0) which causes
                    * precision loss. So we must clamp to 1.0 here to avoid artifacts
                    */
                   a = lp_build_min(coeff_bld, a, coeff_bld->one);
                }

                attrib_name(a, attrib, chan, "");
             }
             bld->attribs[attrib][chan] = a;
          }
       }
    }
 }


 /**
  * Record the block origin in the interpolation context.
  *
  * x0 and y0 are the integer coordinates of the upper left corner; they are
  * converted (signed int to float) to the element type of the coefficient
  * vectors and stored as the scalars bld->x and bld->y.
  */
 static void
 pos_init(struct lp_build_interp_soa_context *bld,
          LLVMValueRef x0,
          LLVMValueRef y0)
 {
    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    LLVMBuilderRef builder = coeff_bld->gallivm->builder;
    LLVMTypeRef float_type = coeff_bld->elem_type;

    bld->x = LLVMBuildSIToFP(builder, x0, float_type, "");
    bld->y = LLVMBuildSIToFP(builder, y0, float_type, "");
 }


 /**
  * Initialize fragment shader input attribute info.
  *
  * Clears the context, records the usage mask and interpolation mode of the
  * position (slot 0) and of every shader input (slots 1..num_inputs),
  * converts the block origin to float and fetches the interpolation
  * coefficients.  Vectors longer than 4 elements use the "simple"
  * interpolation path (coeffs_init_simple), all others use coeffs_init.
  *
  * @param bld              context to initialize; fully overwritten
  * @param gallivm          gallivm state used for the build contexts
  * @param num_inputs       number of entries in inputs
  * @param inputs           per-input usage mask and interpolation mode
  * @param builder          builder used to store the precomputed pixel
  *                         offsets
  * @param type             vector type to interpolate into; must be a signed
  *                         32-bit float vector (asserted)
  * @param dynamic_offsets  whether the quad index is only known at run time
  * @param a0_ptr           pointer to the a0 coefficients
  * @param dadx_ptr         pointer to the dadx coefficients
  * @param dady_ptr         pointer to the dady coefficients
  * @param x0               integer x coordinate of the upper left corner
  * @param y0               integer y coordinate of the upper left corner
  */
 void
 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          struct gallivm_state *gallivm,
                          unsigned num_inputs,
                          const struct lp_shader_input *inputs,
                          LLVMBuilderRef builder,
                          struct lp_type type,
                          boolean dynamic_offsets,
                          LLVMValueRef a0_ptr,
                          LLVMValueRef dadx_ptr,
                          LLVMValueRef dady_ptr,
                          LLVMValueRef x0,
                          LLVMValueRef y0)
 {
    struct lp_type coeff_type;
    struct lp_type setup_type;
    unsigned attrib;
    unsigned chan;

    memset(bld, 0, sizeof *bld);

    /* SoA vectors: one float per pixel, as wide as the shader vector type */
    memset(&coeff_type, 0, sizeof coeff_type);
    coeff_type.floating = TRUE;
    coeff_type.sign = TRUE;
    coeff_type.width = 32;
    coeff_type.length = type.length;

    /* AoS vectors: the four channels of one attribute's coefficients */
    memset(&setup_type, 0, sizeof setup_type);
    setup_type.floating = TRUE;
    setup_type.sign = TRUE;
    setup_type.width = 32;
    setup_type.length = TGSI_NUM_CHANNELS;


    /* XXX: we don't support interpolating into any other types */
    assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);

    lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
    lp_build_context_init(&bld->setup_bld, gallivm, setup_type);

    /* For convenience */
    bld->pos = bld->attribs[0];
    bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];

    /* Position */
    bld->mask[0] = TGSI_WRITEMASK_XYZW;
    bld->interp[0] = LP_INTERP_LINEAR;

    /* Inputs */
    for (attrib = 0; attrib < num_inputs; ++attrib) {
       bld->mask[1 + attrib] = inputs[attrib].usage_mask;
       bld->interp[1 + attrib] = inputs[attrib].interp;
    }
    bld->num_attribs = 1 + num_inputs;

    /* Ensure all masked out input channels have a valid value */
    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
       for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
          bld->attribs[attrib][chan] = bld->coeff_bld.undef;
       }
    }

    pos_init(bld, x0, y0);

    if (coeff_type.length > 4) {
       /* more than one quad per vector: use the simple interpolation path */
       bld->simple_interp = TRUE;
       if (dynamic_offsets) {
          /* XXX this should use a global static table */
          unsigned i;
          /* number of vectors needed to cover the 16 pixels of a block */
          unsigned num_loops = 16 / type.length;
          LLVMValueRef pixoffx, pixoffy, index;
          LLVMValueRef ptr;

          bld->dynamic_offsets = TRUE;
          bld->xoffset_store = lp_build_array_alloca(gallivm,
                                                     lp_build_vec_type(gallivm, type),
                                                     lp_build_const_int32(gallivm, num_loops),
                                                     "");
          bld->yoffset_store = lp_build_array_alloca(gallivm,
                                                     lp_build_vec_type(gallivm, type),
                                                     lp_build_const_int32(gallivm, num_loops),
                                                     "");
          /*
           * Precompute one pair of offset vectors per loop iteration;
           * i*type.length/4 is the index of the first quad of iteration i.
           */
          for (i = 0; i < num_loops; i++) {
             index = lp_build_const_int32(gallivm, i);
             calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy);
             ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, "");
             LLVMBuildStore(builder, pixoffx, ptr);
             ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, "");
             LLVMBuildStore(builder, pixoffy, ptr);
          }
       }
       coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
    }
    else {
       bld->simple_interp = FALSE;
       /* must be set before coeffs_init(), which reads it */
       if (dynamic_offsets) {
          bld->dynamic_offsets = TRUE;
       }
       coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
    }

 }


 /**
  * Update the shader inputs for the given quad within the block.
  *
  * Only attribute slots 1 .. num_attribs-1 (the shader inputs) are
  * processed; slot 0 (the position) is not touched here.
  *
  * @param bld               interpolation context
  * @param gallivm           gallivm state whose builder receives the code
  * @param quad_start_index  compile-time quad index, must be < 4
  */
 void
 lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
                                   struct gallivm_state *gallivm,
                                   int quad_start_index)
 {
    assert(quad_start_index < 4);

    if (!bld->simple_interp) {
       attribs_update(bld, gallivm, quad_start_index, NULL, 1, bld->num_attribs);
       return;
    }

    attribs_update_simple(bld, gallivm, quad_start_index, NULL, 1, bld->num_attribs);
 }

 /**
  * Update the position (attribute slot 0) for the given quad within the
  * block.
  *
  * @param bld               interpolation context
  * @param gallivm           gallivm state whose builder receives the code
  * @param quad_start_index  compile-time quad index, must be < 4
  */
 void
 lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
                                struct gallivm_state *gallivm,
                                int quad_start_index)
 {
    assert(quad_start_index < 4);

    if (!bld->simple_interp) {
       attribs_update(bld, gallivm, quad_start_index, NULL, 0, 1);
       return;
    }

    attribs_update_simple(bld, gallivm, quad_start_index, NULL, 0, 1);
 }

 /**
  * Update the shader inputs (attribute slots 1 .. num_attribs-1) using a
  * quad index that is an LLVM value instead of a compile-time constant.
  *
  * The value is forwarded as the loop iterator of the update function; the
  * compile-time quad index passed along with it is 0.
  *
  * @param bld               interpolation context
  * @param gallivm           gallivm state whose builder receives the code
  * @param quad_start_index  LLVM value holding the quad/loop index
  */
 void
 lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld,
                                       struct gallivm_state *gallivm,
                                       LLVMValueRef quad_start_index)
 {
    if (!bld->simple_interp) {
       attribs_update(bld, gallivm, 0, quad_start_index, 1, bld->num_attribs);
       return;
    }

    attribs_update_simple(bld, gallivm, 0, quad_start_index, 1, bld->num_attribs);
 }

 /**
  * Update the position (attribute slot 0) using a quad index that is an
  * LLVM value instead of a compile-time constant.
  *
  * The value is forwarded as the loop iterator of the update function; the
  * compile-time quad index passed along with it is 0.
  *
  * @param bld               interpolation context
  * @param gallivm           gallivm state whose builder receives the code
  * @param quad_start_index  LLVM value holding the quad/loop index
  */
 void
 lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld,
                                    struct gallivm_state *gallivm,
                                    LLVMValueRef quad_start_index)
 {
    if (!bld->simple_interp) {
       attribs_update(bld, gallivm, 0, quad_start_index, 0, 1);
       return;
    }

    attribs_update_simple(bld, gallivm, 0, quad_start_index, 0, 1);
 }