src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp - platform/external/mesa3d - Git at Google

 /*
  * Copyright © 2011 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */

 #include "brw_vec4.h"
 #include "brw_cfg.h"
 #include "brw_eu.h"
 #include "brw_program.h"

 namespace brw {

 /**
  * Construct an instruction from its opcode, destination and up to three
  * sources.  Every remaining field is reset to a neutral value (flags off,
  * counters zero, pointers NULL) and the execution size is fixed at 8.
  */
 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
                                    const src_reg &src0, const src_reg &src1,
                                    const src_reg &src2)
 {
    /* Operation and operands. */
    this->opcode = opcode;
    this->dst = dst;
    this->src[0] = src0;
    this->src[1] = src1;
    this->src[2] = src2;

    /* Execution size/group and boolean modifiers, all off by default. */
    this->exec_size = 8;
    this->group = 0;
    this->force_writemask_all = false;
    this->saturate = false;
    this->writes_accumulator = false;
    this->no_dd_clear = false;
    this->no_dd_check = false;

    /* No conditional modifier and no predication. */
    this->conditional_mod = BRW_CONDITIONAL_NONE;
    this->predicate = BRW_PREDICATE_NONE;
    this->predicate_inverse = false;
    this->flag_subreg = 0;

    /* Message-related fields; builders that need them set them afterwards. */
    this->base_mrf = 0;
    this->mlen = 0;
    this->header_size = 0;
    this->target = 0;
    this->offset = 0;
    this->shadow_compare = false;
    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;

    /* Provenance; vec4_visitor::emit() overwrites both. */
    this->ir = NULL;
    this->annotation = NULL;

    /* An instruction without a destination writes nothing; otherwise it
     * writes one element of the destination type per channel.
     */
    if (dst.file == BAD_FILE)
       this->size_written = 0;
    else
       this->size_written = this->exec_size * type_sz(dst.type);
 }

 /**
  * Append \p inst to the end of the visitor's instruction list after stamping
  * it with the current base_ir and annotation.
  *
  * \return \p inst, so the caller can keep tweaking it.
  */
 vec4_instruction *
 vec4_visitor::emit(vec4_instruction *inst)
 {
    inst->annotation = this->current_annotation;
    inst->ir = this->base_ir;

    this->instructions.push_tail(inst);
    return inst;
 }

 /**
  * Insert \p new_inst immediately before \p inst inside \p block.  The new
  * instruction inherits the ir and annotation of \p inst.
  *
  * \return \p inst (the instruction used as the insertion point), not
  *         \p new_inst.
  */
 vec4_instruction *
 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
                           vec4_instruction *new_inst)
 {
    new_inst->annotation = inst->annotation;
    new_inst->ir = inst->ir;

    inst->insert_before(block, new_inst);
    return inst;
 }

 /** Allocate a three-source instruction in mem_ctx and append it. */
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                    const src_reg &src1, const src_reg &src2)
 {
    vec4_instruction *inst =
       new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2);

    return emit(inst);
 }


 /** Allocate a two-source instruction in mem_ctx and append it. */
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                    const src_reg &src1)
 {
    vec4_instruction *inst =
       new(mem_ctx) vec4_instruction(opcode, dst, src0, src1);

    return emit(inst);
 }

 /** Allocate a one-source instruction in mem_ctx and append it. */
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
 {
    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dst, src0);

    return emit(inst);
 }

 /** Allocate a source-less instruction writing \p dst and append it. */
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
 {
    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dst);

    return emit(inst);
 }

 /**
  * Allocate an instruction that has neither sources nor a real destination
  * (a default-constructed dst_reg is used) and append it.
  */
 vec4_instruction *
 vec4_visitor::emit(enum opcode opcode)
 {
    const dst_reg no_dst = dst_reg();
    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, no_dst);

    return emit(inst);
 }

 /* Generators for the vec4_visitor ALU builder methods.  Each generated
  * method only allocates the BRW_OPCODE_<op> instruction in mem_ctx and
  * returns it; nothing is appended to the instruction list here, so callers
  * pass the result to emit() themselves.
  */

 /* Builder taking a destination and one source. */
 #define ALU1(op)                                                        \
    vec4_instruction *                                                   \
    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
    {                                                                    \
       vec4_instruction *inst =                                          \
          new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0);     \
       return inst;                                                      \
    }

 /* Builder taking a destination and two sources. */
 #define ALU2(op)                                                        \
    vec4_instruction *                                                   \
    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                     const src_reg &src1)                                \
    {                                                                    \
       vec4_instruction *inst =                                          \
          new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,            \
                                        src0, src1);                     \
       return inst;                                                      \
    }

 /* Same as ALU2, but the result is flagged as writing the accumulator. */
 #define ALU2_ACC(op)                                                    \
    vec4_instruction *                                                   \
    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                     const src_reg &src1)                                \
    {                                                                    \
       vec4_instruction *inst =                                          \
          new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,            \
                                        src0, src1);                     \
       inst->writes_accumulator = true;                                  \
       return inst;                                                      \
    }

 /* Builder taking a destination and three sources; asserts gen >= 6. */
 #define ALU3(op)                                                        \
    vec4_instruction *                                                   \
    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                     const src_reg &src1, const src_reg &src2)           \
    {                                                                    \
       assert(devinfo->gen >= 6);                                        \
       vec4_instruction *inst =                                          \
          new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,            \
                                        src0, src1, src2);               \
       return inst;                                                      \
    }

 ALU1(NOT)
 ALU1(MOV)
 ALU1(FRC)
 ALU1(RNDD)
 ALU1(RNDE)
 ALU1(RNDZ)
 ALU1(F32TO16)
 ALU1(F16TO32)
 ALU2(ADD)
 ALU2(MUL)
 ALU2_ACC(MACH)
 ALU2(AND)
 ALU2(OR)
 ALU2(XOR)
 ALU2(DP3)
 ALU2(DP4)
 ALU2(DPH)
 ALU2(SHL)
 ALU2(SHR)
 ALU2(ASR)
 ALU3(LRP)
 ALU1(BFREV)
 ALU3(BFE)
 ALU2(BFI1)
 ALU3(BFI2)
 ALU1(FBH)
 ALU1(FBL)
 ALU1(CBIT)
 ALU3(MAD)
 ALU2_ACC(ADDC)
 ALU2_ACC(SUBB)
 ALU2(MAC)
 ALU1(DIM)

 /**
  * Gen4 predicated IF.
  *
  * Builds (but does not emit) an IF instruction whose predicate field is set
  * to \p predicate.
  */
 vec4_instruction *
 vec4_visitor::IF(enum brw_predicate predicate)
 {
    vec4_instruction *if_inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);

    if_inst->predicate = predicate;
    return if_inst;
 }

 /**
  * Gen6 IF with embedded comparison.
  *
  * Builds (but does not emit) an IF that compares \p src0 against \p src1
  * using \p condition.  Both sources are run through resolve_ud_negate()
  * first.  Asserts that the device is gen 6.
  */
 vec4_instruction *
 vec4_visitor::IF(src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
 {
    assert(devinfo->gen == 6);

    resolve_ud_negate(&src0);
    resolve_ud_negate(&src1);

    vec4_instruction *if_inst =
       new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(), src0, src1);
    if_inst->conditional_mod = condition;

    return if_inst;
 }

 /**
  * CMP: Sets the low bit of the destination channels with the result
  * of the comparison, while the upper bits are undefined, and updates
  * the flag register with the packed 16 bits of the result.
  *
  * The instruction is only built here, not emitted.
  */
 vec4_instruction *
 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                   enum brw_conditional_mod condition)
 {
    /* Consider
     *
     *    CMP null<d> src0<f> src1<f>
     *
     * Original gen4 converts to the destination type before comparing, which
     * yields garbage for floating point comparisons.  Newer generations do
     * not care about the destination type, so forcing it to match src0 is
     * always fine and additionally lets the instruction be compacted.
     */
    dst.type = src0.type;

    resolve_ud_negate(&src0);
    resolve_ud_negate(&src1);

    vec4_instruction *cmp =
       new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
    cmp->conditional_mod = condition;

    return cmp;
 }

 /**
  * Build (without emitting) a gen4-style scratch read of \p index into
  * \p dst.  The message is two registers long and starts one MRF past the
  * first spill MRF of the current generation.
  */
 vec4_instruction *
 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
 {
    vec4_instruction *read =
       new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
                                     dst, index);

    read->mlen = 2;
    read->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;

    return read;
 }

 /**
  * Build (without emitting) a gen4-style scratch write with sources \p src
  * and \p index.  The message is three registers long and starts at the
  * first spill MRF of the current generation.
  */
 vec4_instruction *
 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                             const src_reg &index)
 {
    vec4_instruction *write =
       new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
                                     dst, src, index);

    write->mlen = 3;
    write->base_mrf = FIRST_SPILL_MRF(devinfo->gen);

    return write;
 }

 /**
  * Make \p src usable as an operand of a three-source instruction.
  *
  * Using vec4 uniforms in SIMD4x2 programs is difficult.  Ideally a vertical
  * stride of zero would replicate the vec4 uniform, as in
  *
  *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
  *
  * but three-source instructions always use a vertical stride of four.  So
  * the replication is done by a separate instruction writing a temporary,
  * and the temporary is handed to the three-source instruction instead.
  *
  * \return \p src unchanged when no fix-up is needed, otherwise a fresh
  *         temporary holding the expanded value.
  */
 src_reg
 vec4_visitor::fix_3src_operand(const src_reg &src)
 {
    const bool is_uniform = src.file == UNIFORM;

    /* Only uniforms and immediates need the extra instruction. */
    if (!is_uniform && src.file != IMM)
       return src;

    /* A uniform read with a single-value swizzle is usable as it stands. */
    if (is_uniform && brw_is_single_value_swizzle(src.swizzle))
       return src;

    dst_reg replicated(this, glsl_type::vec4_type);
    replicated.type = src.type;
    emit(VEC4_OPCODE_UNPACK_UNIFORM, replicated, src);

    return src_reg(replicated);
 }

 /**
  * Return an operand equivalent to \p src that carries no abs/negate
  * modifier.  When \p src has either modifier, a MOV into a new temporary of
  * the same register type is emitted and the temporary is returned;
  * otherwise \p src is returned untouched.
  */
 src_reg
 vec4_visitor::resolve_source_modifiers(const src_reg &src)
 {
    const bool has_modifier = src.abs || src.negate;
    if (!has_modifier)
       return src;

    dst_reg plain(this, glsl_type::ivec4_type);
    plain.type = src.type;
    emit(MOV(plain, src));

    return src_reg(plain);
 }

 /**
  * Prepare \p src for use as a math instruction operand.
  *
  * The gen6 math instruction ignores the source modifiers -- swizzle, abs,
  * negate, and at least some parts of the register region description.
  * Rather than enumerating the cases, gen6 operands are *always* copied to a
  * temporary GRF.  Gen7 takes operands as they are, except for immediates,
  * which it still cannot use.  Other generations, and BAD_FILE operands,
  * are passed through unchanged.
  */
 src_reg
 vec4_visitor::fix_math_operand(const src_reg &src)
 {
    const int gen = devinfo->gen;
    const bool needs_temp = gen == 6 || (gen == 7 && src.file == IMM);

    if (src.file == BAD_FILE || !needs_temp)
       return src;

    dst_reg temp(this, glsl_type::vec4_type);
    temp.type = src.type;
    emit(MOV(temp, src));

    return src_reg(temp);
 }

 /**
  * Emit the math instruction \p opcode computing \p dst from \p src0 and
  * (optionally) \p src1, applying the per-generation operand fix-ups.
  *
  * \param src1  second operand; BAD_FILE for single-operand math.
  * \return the last instruction emitted: the math instruction itself, or on
  *         gen6 with a partial writemask, the MOV that copies the result
  *         into \p dst.
  */
 vec4_instruction *
 vec4_visitor::emit_math(enum opcode opcode,
                         const dst_reg &dst,
                         const src_reg &src0, const src_reg &src1)
 {
    /* fix_math_operand() may emit a MOV of its own.  The order in which
     * function arguments are evaluated is unspecified in C++, so resolve the
     * operands in separate statements to keep the generated instruction
     * order independent of the compiler.
     */
    const src_reg op0 = fix_math_operand(src0);
    const src_reg op1 = fix_math_operand(src1);

    vec4_instruction *math = emit(opcode, dst, op0, op1);

    if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
       /* MATH on Gen6 must be align1, so we can't do writemasks.  Compute
        * into a full temporary and copy the requested channels afterwards.
        */
       math->dst = dst_reg(this, glsl_type::vec4_type);
       math->dst.type = dst.type;
       math = emit(MOV(dst, src_reg(math->dst)));
    } else if (devinfo->gen < 6) {
       /* Before gen6 the operands travel in a message starting at MRF 1,
        * one register per operand actually present.
        */
       math->base_mrf = 1;
       math->mlen = src1.file == BAD_FILE ? 1 : 2;
    }

    return math;
 }

 /**
  * Emit the instruction sequence for packHalf2x16: convert the X and Y
  * channels of \p src0 (type F) to half floats and combine them into each
  * written channel of \p dst (type UD) as 0xhhhhllll.
  *
  * Only valid on gen7+; earlier generations must have lowered the operation.
  */
 void
 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
 {
    if (devinfo->gen < 7) {
       unreachable("ir_unop_pack_half_2x16 should be lowered");
    }

    assert(dst.type == BRW_REGISTER_TYPE_UD);
    assert(src0.type == BRW_REGISTER_TYPE_F);

    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
     *
     *   Because this instruction does not have a 16-bit floating-point type,
     *   the destination data type must be Word (W).
     *
     *   The destination must be DWord-aligned and specify a horizontal stride
     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
     *   each destination channel and the upper word is not modified.
     *
     * Horizontal stride can only be specified in align1 mode, so the quoted
     * restriction implies that f32to16 has to be an align1 instruction.  This
     * code deliberately ignores the documentation and emits align16.
     *
     * (chadv tried emitting align1 f32to16 for the VS.  The result passed
     * every test, but the code was dubiously correct and fragile, and the
     * tests were not harsh enough to expose that, so align16 was kept.)
     *
     * chadv confirmed experimentally, on gen7 hardware and on the simulator,
     * that f32to16 in align16 mode with a UD destination is safe.  It differs
     * from the PRM in that the upper word of each destination channel is
     * cleared to 0.
     */

    dst_reg half_dst(this, glsl_type::uvec2_type);
    src_reg half_src(half_dst);

 #if 0
    /* Verify the undocumented behavior on which the following instructions
     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
     * then the result of the bit-or instruction below will be incorrect.
     *
     * You should inspect the disasm output in order to verify that the MOV is
     * not optimized away.
     */
    emit(MOV(half_dst, brw_imm_ud(0x12345678u)));
 #endif

    /* Step 1: bring the temporary into this shape ("." means untouched):
     *
     *     w z          y          x w z          y          x
     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
     *
     * The shift and OR below need the upper word of each written channel to
     * be 0, which is exactly the undocumented behavior described above.
     */
    half_dst.writemask = WRITEMASK_XY;
    emit(F32TO16(half_dst, src0));

    /* Step 2: move the high half into place, leaving the written channels of
     * dst as
     *   0xhhhh0000
     */
    half_src.swizzle = BRW_SWIZZLE_YYYY;
    emit(SHL(dst, half_src, brw_imm_ud(16u)));

    /* Step 3: merge in the low half, producing packHalf2x16's result in the
     * written channels of dst:
     *   0xhhhhllll
     */
    half_src.swizzle = BRW_SWIZZLE_XXXX;
    emit(OR(dst, src_reg(dst), half_src));
 }

 /**
  * Emit the instruction sequence for unpackHalf2x16: split \p src0 (type UD)
  * into its low and high 16-bit words and convert both to float, writing the
  * X and Y channels of \p dst (type F).
  *
  * Only valid on gen7+; earlier generations must have lowered the operation.
  */
 void
 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
 {
    if (devinfo->gen < 7) {
       unreachable("ir_unop_unpack_half_2x16 should be lowered");
    }

    assert(dst.type == BRW_REGISTER_TYPE_F);
    assert(src0.type == BRW_REGISTER_TYPE_UD);

    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
     *
     *   Because this instruction does not have a 16-bit floating-point type,
     *   the source data type must be Word (W). The destination type must be
     *   F (Float).
     *
     * A W source requires adjusting horizontal strides, which only align1
     * mode allows.  chadv's attempts at align1 unpackHalf2x16 all failed the
     * Piglit tests and were abandoned.
     *
     * It has been verified, on gen7 hardware and the simulator, that
     * f16to32 in align16 mode with a UD source is safe.
     */

    dst_reg words_dst(this, glsl_type::uvec2_type);
    src_reg words_src(words_dst);

    /* X receives the low word... */
    words_dst.writemask = WRITEMASK_X;
    emit(AND(words_dst, src0, brw_imm_ud(0xffffu)));

    /* ...and Y the high word. */
    words_dst.writemask = WRITEMASK_Y;
    emit(SHR(words_dst, src0, brw_imm_ud(16u)));

    /* Convert both halves at once. */
    dst.writemask = WRITEMASK_XY;
    emit(F16TO32(dst, words_src));
 }

 void
 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
 {
    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
     * is not suitable to generate the shift values, but we can use the packed
     * vector float and a type-converting MOV.
     */
    dst_reg shift(this, glsl_type::uvec4_type);
    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

    dst_reg shifted(this, glsl_type::uvec4_type);
    src0.swizzle = BRW_SWIZZLE_XXXX;
    emit(SHR(shifted, src0, src_reg(shift)));

    shifted.type = BRW_REGISTER_TYPE_UB;
    dst_reg f(this, glsl_type::vec4_type);
    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
 }

 void
 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
 {
    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
     * is not suitable to generate the shift values, but we can use the packed
     * vector float and a type-converting MOV.
     */
    dst_reg shift(this, glsl_type::uvec4_type);
    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

    dst_reg shifted(this, glsl_type::uvec4_type);
    src0.swizzle = BRW_SWIZZLE_XXXX;
    emit(SHR(shifted, src0, src_reg(shift)));

    shifted.type = BRW_REGISTER_TYPE_B;
    dst_reg f(this, glsl_type::vec4_type);
    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

    dst_reg scaled(this, glsl_type::vec4_type);
    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));

    dst_reg max(this, glsl_type::vec4_type);
    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
 }

 void
 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
 {
    dst_reg saturated(this, glsl_type::vec4_type);
    vec4_instruction *inst = emit(MOV(saturated, src0));
    inst->saturate = true;

    dst_reg scaled(this, glsl_type::vec4_type);
    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));

    dst_reg rounded(this, glsl_type::vec4_type);
    emit(RNDE(rounded, src_reg(scaled)));

    dst_reg u(this, glsl_type::uvec4_type);
    emit(MOV(u, src_reg(rounded)));

    src_reg bytes(u);
    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 }

 void
 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
 {
    dst_reg max(this, glsl_type::vec4_type);
    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));

    dst_reg min(this, glsl_type::vec4_type);
    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));

    dst_reg scaled(this, glsl_type::vec4_type);
    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));

    dst_reg rounded(this, glsl_type::vec4_type);
    emit(RNDE(rounded, src_reg(scaled)));

    dst_reg i(this, glsl_type::ivec4_type);
    emit(MOV(i, src_reg(rounded)));

    src_reg bytes(i);
    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
 }

 /**
  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
  * false) elements needed to pack a type.
  *
  * When counting in vec4 units, dual-slot types cost two elements per
  * vector/column; when counting in dvec4 units they cost one.
  */
 static int
 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
 {
    switch (type->base_type) {
    case GLSL_TYPE_UINT:
    case GLSL_TYPE_INT:
    case GLSL_TYPE_FLOAT:
    case GLSL_TYPE_BOOL:
    case GLSL_TYPE_DOUBLE:
       if (type->is_matrix()) {
          /* One vector (or two, for dual-slot columns) per column. */
          const glsl_type *column = type->column_type();
          const bool dual_slot_column = as_vec4 && column->is_dual_slot();
          unsigned slots_per_column = dual_slot_column ? 2 : 1;
          return type->matrix_columns * slots_per_column;
       }

       /* Any scalar or vector occupies a whole vec4, whatever its width.
        * That packs floats poorly, but anything tighter makes arrays a
        * mess.  A later pass may pack scalars down where appropriate.
        */
       if (as_vec4 && type->is_dual_slot())
          return 2;
       return 1;

    case GLSL_TYPE_ARRAY:
       assert(type->length > 0);
       return type_size_xvec4(type->fields.array, as_vec4) * type->length;

    case GLSL_TYPE_STRUCT: {
       /* Sum of the sizes of all members. */
       int total = 0;
       for (unsigned int field = 0; field < type->length; field++) {
          total += type_size_xvec4(type->fields.structure[field].type,
                                   as_vec4);
       }
       return total;
    }

    case GLSL_TYPE_SUBROUTINE:
       return 1;

    case GLSL_TYPE_SAMPLER:
       /* Samplers are baked in at link time and need no register space. */
       return 0;

    case GLSL_TYPE_ATOMIC_UINT:
       return 0;

    case GLSL_TYPE_IMAGE:
       return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);

    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
    case GLSL_TYPE_FUNCTION:
       unreachable("not reached");
    }

    return 0;
 }

 /**
  * Returns the minimum number of vec4 elements needed to pack a type.
  *
  * Simple types take 1 (a single vec4); matrices take one per column; arrays
  * and structs take the sum of the vec4 sizes of their elements; samplers
  * and atomics take zero.
  *
  * Use this to work out how much register space a given type requires.
  */
 extern "C" int
 type_size_vec4(const struct glsl_type *type)
 {
    const bool count_in_vec4 = true;

    return type_size_xvec4(type, count_in_vec4);
 }

 /**
  * Returns the minimum number of dvec4 elements needed to pack a type.
  *
  * Simple types take 1 (a single dvec4); matrices take one per column;
  * arrays and structs take the sum of the dvec4 sizes of their elements;
  * samplers and atomics take zero.
  *
  * Use this to work out how much register space a given type requires.
  *
  * Double-precision vertex inputs have to be measured in dvec4 units because
  * ARB_vertex_attrib_64bit states that they use the same number of locations
  * as their single-precision counterparts.  That is, two consecutive dvec4
  * inputs live at location "x" and location "x+1", not "x+2".
  *
  * To map vec4/dvec4 vertex inputs onto the proper ATTRs, remap_vs_attrs()
  * takes into account both the location and whether the type fits in one or
  * two vec4 slots.
  */
 extern "C" int
 type_size_dvec4(const struct glsl_type *type)
 {
    const bool count_in_vec4 = false;

    return type_size_xvec4(type, count_in_vec4);
 }

 /**
  * Create a source register backed by a freshly allocated VGRF large enough
  * for \p type.  Arrays and records get the no-op swizzle; everything else
  * gets the swizzle matching its vector width.
  */
 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 {
    init();

    this->file = VGRF;
    this->nr = v->alloc.allocate(type_size_vec4(type));
    this->type = brw_type_for_base_type(type);

    const bool is_aggregate = type->is_array() || type->is_record();
    if (is_aggregate)
       this->swizzle = BRW_SWIZZLE_NOOP;
    else
       this->swizzle = brw_swizzle_for_size(type->vector_elements);
 }

 /**
  * Create a source register backed by a freshly allocated VGRF holding
  * \p size consecutive copies of \p type.  The swizzle is always the no-op
  * swizzle.  \p size must be positive.
  */
 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 {
    assert(size > 0);

    init();

    const int total_regs = type_size_vec4(type) * size;

    this->file = VGRF;
    this->nr = v->alloc.allocate(total_regs);
    this->type = brw_type_for_base_type(type);
    this->swizzle = BRW_SWIZZLE_NOOP;
 }

 /**
  * Create a destination register backed by a freshly allocated VGRF large
  * enough for \p type.  Arrays and records write all four channels; other
  * types write only their first vector_elements channels.
  */
 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 {
    init();

    this->file = VGRF;
    this->nr = v->alloc.allocate(type_size_vec4(type));
    this->type = brw_type_for_base_type(type);

    const bool is_aggregate = type->is_array() || type->is_record();
    if (is_aggregate)
       this->writemask = WRITEMASK_XYZW;
    else
       this->writemask = (1 << type->vector_elements) - 1;
 }

 /**
  * Emit a SEL of \p src0 and \p src1 into \p dst using \p conditionalmod.
  * Callers in this file pass BRW_CONDITIONAL_GE to get a maximum and
  * BRW_CONDITIONAL_L to get a minimum.
  *
  * \return the emitted SEL instruction.
  */
 vec4_instruction *
 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                           src_reg src0, src_reg src1)
 {
    vec4_instruction *sel = emit(BRW_OPCODE_SEL, dst, src0, src1);

    sel->conditional_mod = conditionalmod;
    return sel;
 }

 /**
  * Emit a linear interpolation, dst = x*(1-a) + y*a.
  *
  * Gen6+ uses the native LRP instruction; earlier generations have no
  * three-source instructions, so the expression is expanded into MUL/ADD.
  *
  * \return the last instruction emitted (the one writing \p dst).
  */
 vec4_instruction *
 vec4_visitor::emit_lrp(const dst_reg &dst,
                        const src_reg &x, const src_reg &y, const src_reg &a)
 {
    if (devinfo->gen >= 6) {
       /* fix_3src_operand() may emit an instruction of its own.  The order
        * in which function arguments are evaluated is unspecified in C++, so
        * resolve the operands in separate statements to keep the generated
        * instruction order independent of the compiler.
        */
       const src_reg fixed_a = fix_3src_operand(a);
       const src_reg fixed_y = fix_3src_operand(y);
       const src_reg fixed_x = fix_3src_operand(x);

       /* Note that the instruction's argument order is reversed from GLSL
        * and the IR.
        */
       return emit(LRP(dst, fixed_a, fixed_y, fixed_x));
    } else {
       /* Earlier generations don't support three source operations, so we
        * need to emit x*(1-a) + y*a.
        */
       dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
       dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
       dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);

       /* The temporaries only need the channels the caller asked for. */
       y_times_a.writemask           = dst.writemask;
       one_minus_a.writemask         = dst.writemask;
       x_times_one_minus_a.writemask = dst.writemask;

       emit(MUL(y_times_a, y, a));
       emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
       return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
    }
 }

 /**
  * Emits the instructions needed to perform a pull constant load of
  * \p offset_reg from surface \p surf_index into \p dst.
  *
  * \p before_block and \p before_inst must either both be NULL, in which
  * case the instructions are appended to the end of the instruction list, or
  * both be set, in which case they are inserted before \p before_inst.
  */
 void
 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                           src_reg surf_index,
                                           src_reg offset_reg,
                                           bblock_t *before_block,
                                           vec4_instruction *before_inst)
 {
    assert((before_inst == NULL && before_block == NULL) ||
           (before_inst && before_block));

    /* The load itself; emitted after the per-generation setup below. */
    vec4_instruction *pull;

    if (devinfo->gen >= 9) {
       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
       src_reg header(this, glsl_type::uvec4_type, 2);

       vec4_instruction *set_header = new(mem_ctx)
          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
                           dst_reg(header));

       if (before_inst != NULL)
          emit_before(before_block, before_inst, set_header);
       else
          emit(set_header);

       /* The offset goes into X of the register following the header. */
       dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
                                  offset_reg.type);
       vec4_instruction *set_offset =
          MOV(writemask(index_reg, WRITEMASK_X), offset_reg);

       if (before_inst != NULL)
          emit_before(before_block, before_inst, set_offset);
       else
          emit(set_offset);

       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                            dst,
                                            surf_index,
                                            header);
       pull->header_size = 1;
       pull->mlen = 2;
    } else if (devinfo->gen >= 7) {
       /* Copy the offset into a temporary GRF used as the payload. */
       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
       grf_offset.type = offset_reg.type;

       vec4_instruction *set_offset = MOV(grf_offset, offset_reg);

       if (before_inst != NULL)
          emit_before(before_block, before_inst, set_offset);
       else
          emit(set_offset);

       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                            dst,
                                            surf_index,
                                            src_reg(grf_offset));
       pull->mlen = 1;
    } else {
       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
                                            dst,
                                            surf_index,
                                            offset_reg);
       pull->mlen = 1;
       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
    }

    if (before_inst != NULL)
       emit_before(before_block, before_inst, pull);
    else
       emit(pull);
 }

 /**
  * Emit FIND_LIVE_CHANNEL followed by BROADCAST so that the returned register
  * holds the value \p src has in the channel selected by FIND_LIVE_CHANNEL.
  * Both instructions are emitted with force_writemask_all set.
  *
  * \return a new temporary with the same register type as \p src.
  */
 src_reg
 vec4_visitor::emit_uniformize(const src_reg &src)
 {
    const src_reg chan_index(this, glsl_type::uint_type);
    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
                               src.type);

    vec4_instruction *find_live =
       emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index));
    find_live->force_writemask_all = true;

    vec4_instruction *broadcast =
       emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index);
    broadcast->force_writemask_all = true;

    return src_reg(dst);
 }

 /**
  * Emit a TXF_MCS message for \p coordinate on \p surface, together with the
  * MOVs that build its payload.
  *
  * \param coordinate_type  GLSL type of \p coordinate; its vector_elements
  *                         count selects which payload channels receive the
  *                         coordinate and which are zero-filled.
  * \return the uvec4 temporary written by the TXF_MCS instruction.
  */
 src_reg
 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
                              src_reg coordinate, src_reg surface)
 {
    vec4_instruction *inst =
       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
                                     dst_reg(this, glsl_type::uvec4_type));
    inst->base_mrf = 2;
    /* Both the surface and the sampler operand slots carry the surface. */
    inst->src[1] = surface;
    inst->src[2] = surface;

    int param_base;

    if (devinfo->gen >= 9) {
       /* Gen9+ needs a message header in order to use SIMD4x2 mode */
       vec4_instruction *header_inst = new(mem_ctx)
          vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
                           dst_reg(MRF, inst->base_mrf));

       emit(header_inst);

       inst->mlen = 2;
       inst->header_size = 1;
       param_base = inst->base_mrf + 1;
    } else {
       inst->mlen = 1;
       param_base = inst->base_mrf;
    }

    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
    int zero_mask = 0xf & ~coord_mask;

    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
             coordinate));

    /* Zero the channels the coordinate does not cover.  Skip the MOV when
     * the coordinate already fills all four channels, rather than emitting
     * an instruction with an empty writemask (emit_texture() guards its
     * zero-fill the same way).
     */
    if (zero_mask != 0) {
       emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
                brw_imm_d(0)));
    }

    emit(inst);
    return src_reg(inst->dst);
 }

 /**
  * Report whether \p sampler must be treated as a "high" sampler: on Haswell
  * and gen8+, any sampler that is not an immediate, or an immediate index of
  * 16 or more.  Always false on other hardware.
  */
 bool
 vec4_visitor::is_high_sampler(src_reg sampler)
 {
    const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
    if (!hsw_or_later)
       return false;

    /* A non-immediate index could be anything, so assume the worst. */
    if (sampler.file != IMM)
       return true;

    return sampler.ud >= 16;
 }

 void
 vec4_visitor::emit_texture(ir_texture_opcode op,
                            dst_reg dest,
                            const glsl_type *dest_type,
                            src_reg coordinate,
                            int coord_components,
                            src_reg shadow_comparator,
                            src_reg lod, src_reg lod2,
                            src_reg sample_index,
                            uint32_t constant_offset,
                            src_reg offset_value,
                            src_reg mcs,
                            uint32_t surface,
                            src_reg surface_reg,
                            src_reg sampler_reg)
 {
    /* The sampler can only meaningfully compute LOD for fragment shader
     * messages. For all other stages, we change the opcode to TXL and hardcode
     * the LOD to 0.
     *
     * textureQueryLevels() is implemented in terms of TXS so we need to pass a
     * valid LOD argument.
     */
    if (op == ir_tex || op == ir_query_levels) {
       assert(lod.file == BAD_FILE);
       lod = brw_imm_f(0.0f);
    }

    enum opcode opcode;
    switch (op) {
    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
    case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
                              SHADER_OPCODE_TXF_CMS); break;
    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
    case ir_tg4: opcode = offset_value.file != BAD_FILE
                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
    case ir_txb:
       unreachable("TXB is not valid for vertex shaders.");
    case ir_lod:
       unreachable("LOD is not valid for vertex shaders.");
    case ir_samples_identical: {
       /* There are some challenges implementing this for vec4, and it seems
        * unlikely to be used anyway.  For now, just always return false.
        */
       emit(MOV(dest, brw_imm_ud(0u)));
       return;
    }
    default:
       unreachable("Unrecognized tex op");
    }

    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);

    inst->offset = constant_offset;

    /* The message header is necessary for:
     * - Gen4 (always)
     * - Gen9+ for selecting SIMD4x2
     * - Texel offsets
     * - Gather channel selection
     * - Sampler indices too large to fit in a 4-bit value.
     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
     */
    inst->header_size =
       (devinfo->gen < 5 || devinfo->gen >= 9 ||
        inst->offset != 0 || op == ir_tg4 ||
        op == ir_texture_samples ||
        is_high_sampler(sampler_reg)) ? 1 : 0;
    inst->base_mrf = 2;
    inst->mlen = inst->header_size;
    inst->dst.writemask = WRITEMASK_XYZW;
    inst->shadow_compare = shadow_comparator.file != BAD_FILE;

    inst->src[1] = surface_reg;
    inst->src[2] = sampler_reg;

    /* MRF for the first parameter */
    int param_base = inst->base_mrf + inst->header_size;

    if (op == ir_txs || op == ir_query_levels) {
       int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
       inst->mlen++;
    } else if (op == ir_texture_samples) {
       inst->dst.writemask = WRITEMASK_X;
    } else {
       /* Load the coordinate */
       /* FINISHME: gl_clamp_mask and saturate */
       int coord_mask = (1 << coord_components) - 1;
       int zero_mask = 0xf & ~coord_mask;

       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
                coordinate));
       inst->mlen++;

       if (zero_mask != 0) {
          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                   brw_imm_d(0)));
       }
       /* Load the shadow comparator */
       if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
 	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
 			  WRITEMASK_X),
 		  shadow_comparator));
 	 inst->mlen++;
       }

       /* Load the LOD info */
       if (op == ir_tex || op == ir_txl) {
 	 int mrf, writemask;
 	 if (devinfo->gen >= 5) {
 	    mrf = param_base + 1;
 	    if (shadow_comparator.file != BAD_FILE) {
 	       writemask = WRITEMASK_Y;
 	       /* mlen already incremented */
 	    } else {
 	       writemask = WRITEMASK_X;
 	       inst->mlen++;
 	    }
 	 } else /* devinfo->gen == 4 */ {
 	    mrf = param_base;
 	    writemask = WRITEMASK_W;
 	 }
 	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
       } else if (op == ir_txf) {
          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
       } else if (op == ir_txf_ms) {
          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                   sample_index));
          if (opcode == SHADER_OPCODE_TXF_CMS_W) {
             /* MCS data is stored in the first two channels of ‘mcs’, but we
              * need to get it into the .y and .z channels of the second vec4
              * of params.
              */
             mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
             emit(MOV(dst_reg(MRF, param_base + 1,
                              glsl_type::uint_type, WRITEMASK_YZ),
                      mcs));
          } else if (devinfo->gen >= 7) {
             /* MCS data is in the first channel of `mcs`, but we need to get it into
              * the .y channel of the second vec4 of params, so replicate .x across
              * the whole vec4 and then mask off everything except .y
              */
             mcs.swizzle = BRW_SWIZZLE_XXXX;
             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
                      mcs));
          }
          inst->mlen++;
       } else if (op == ir_txd) {
          const brw_reg_type type = lod.type;

 	 if (devinfo->gen >= 5) {
 	    lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
 	    lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
 	    inst->mlen++;

 	    if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
 	       lod.swizzle = BRW_SWIZZLE_ZZZZ;
 	       lod2.swizzle = BRW_SWIZZLE_ZZZZ;
 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
 	       inst->mlen++;

                if (shadow_comparator.file != BAD_FILE) {
                   emit(MOV(dst_reg(MRF, param_base + 2,
                                    shadow_comparator.type, WRITEMASK_Z),
                            shadow_comparator));
                }
 	    }
 	 } else /* devinfo->gen == 4 */ {
 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
 	    inst->mlen += 2;
 	 }
       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
          if (shadow_comparator.file != BAD_FILE) {
             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
                      shadow_comparator));
          }

          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                   offset_value));
          inst->mlen++;
       }
    }

    emit(inst);

    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
     * spec requires layers.
     */
    if (op == ir_txs && devinfo->gen < 7) {
       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
       emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
                   src_reg(inst->dst), brw_imm_d(1));
    }

    if (devinfo->gen == 6 && op == ir_tg4) {
       emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
    }

    if (op == ir_query_levels) {
       /* # levels is in .w */
       src_reg swizzled(dest);
       swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
                                       SWIZZLE_W, SWIZZLE_W);
       emit(MOV(dest, swizzled));
    }
 }

 /**
  * Apply workarounds for Gen6 gather with UINT/SINT
  */
 void
 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
 {
    if (!wa)
       return;

    int width = (wa & WA_8BIT) ? 8 : 16;
    dst_reg dst_f = dst;
    dst_f.type = BRW_REGISTER_TYPE_F;

    /* Convert from UNORM to UINT */
    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
    emit(MOV(dst, src_reg(dst_f)));

    if (wa & WA_SIGN) {
       /* Reinterpret the UINT value as a signed INT value by
        * shifting the sign bit into place, then shifting back
        * preserving sign.
        */
       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
    }
 }

 /**
  * Geometry shader vertex emission hook.
  *
  * This implementation is never expected to run: it only marks the path as
  * unreachable.  The stream id parameter is deliberately left unnamed
  * because it is unused here.
  * NOTE(review): presumably a geometry-shader-specific visitor overrides
  * this -- confirm against the class declaration.
  */
 void
 vec4_visitor::gs_emit_vertex(int /* stream_id */)
 {
    unreachable("not reached");
 }

 /**
  * Geometry shader end-of-primitive hook.
  *
  * This implementation is never expected to run: it only marks the path as
  * unreachable.
  * NOTE(review): presumably a geometry-shader-specific visitor overrides
  * this -- confirm against the class declaration.
  */
 void
 vec4_visitor::gs_end_primitive()
 {
    unreachable("not reached");
 }

 void
 vec4_visitor::emit_ndc_computation()
 {
    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
       return;

    /* Get the position */
    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);

    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
    output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
    output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;

    current_annotation = "NDC";
    dst_reg ndc_w = ndc;
    ndc_w.writemask = WRITEMASK_W;
    src_reg pos_w = pos;
    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

    dst_reg ndc_xyz = ndc;
    ndc_xyz.writemask = WRITEMASK_XYZ;

    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
 }

 /**
  * Writes the VUE slot that carries point size and associated flags into
  * \p reg.
  *
  * Three layouts are produced:
  *  - Gen6+: the slot is zeroed, then point size goes to .w, layer to .y and
  *    viewport index to .z, each only when the matching bit is set in the
  *    VUE map's slots_valid.
  *  - Pre-Gen6 with nothing to report (no point size, no clip distances and
  *    no negative-rhw workaround): the slot is simply zeroed.
  *  - Pre-Gen6 otherwise: a header dword is assembled in a temporary's .w
  *    channel from the point size, the clip flags and the negative-rhw
  *    workaround bit, and the temporary is then copied to \p reg.
  */
 void
 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
 {
    const bool writes_psiz =
       (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) != 0;

    if (devinfo->gen >= 6) {
       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));

       if (writes_psiz) {
          dst_reg psiz_dst = reg;
          psiz_dst.writemask = WRITEMASK_W;

          /* Read the point size with the destination's type and a
           * single-component swizzle.
           */
          src_reg psiz_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
          psiz_src.type = psiz_dst.type;
          psiz_src.swizzle = brw_swizzle_for_size(1);
          emit(MOV(psiz_dst, psiz_src));
       }

       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
          dst_reg layer_dst = reg;
          layer_dst.writemask = WRITEMASK_Y;
          layer_dst.type = BRW_REGISTER_TYPE_D;
          output_reg[VARYING_SLOT_LAYER][0].type = layer_dst.type;
          emit(MOV(layer_dst, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
       }

       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
          dst_reg viewport_dst = reg;
          viewport_dst.writemask = WRITEMASK_Z;
          viewport_dst.type = BRW_REGISTER_TYPE_D;
          output_reg[VARYING_SLOT_VIEWPORT][0].type = viewport_dst.type;
          emit(MOV(viewport_dst,
                   src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
       }

       return;
    }

    /* Everything below is the pre-Gen6 layout. */
    const bool has_clip_distances =
       output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE;

    if (!writes_psiz && !has_clip_distances &&
        !devinfo->has_negative_rhw_bug) {
       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
       return;
    }

    dst_reg header = dst_reg(this, glsl_type::uvec4_type);
    dst_reg header_w = header;
    header_w.writemask = WRITEMASK_W;

    emit(MOV(header, brw_imm_ud(0u)));

    if (writes_psiz) {
       src_reg point_size = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);

       current_annotation = "Point size";
       /* Scale the point size by 2^11 and keep only bits 8..18. */
       emit(MUL(header_w, point_size, brw_imm_f((float)(1 << 11))));
       emit(AND(header_w, src_reg(header_w), brw_imm_d(0x7ff << 8)));
    }

    if (has_clip_distances) {
       current_annotation = "Clipping flags";
       dst_reg clip_flags_lo = dst_reg(this, glsl_type::uint_type);
       dst_reg clip_flags_hi = dst_reg(this, glsl_type::uint_type);

       /* Flags for the first clip distance vec4 are OR'ed in directly. */
       emit(CMP(dst_null_f(),
                src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]),
                brw_imm_f(0.0f), BRW_CONDITIONAL_L));
       emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, clip_flags_lo, brw_imm_d(0));
       emit(OR(header_w, src_reg(header_w), src_reg(clip_flags_lo)));

       /* Flags for the second vec4 are shifted up by four bits first. */
       emit(CMP(dst_null_f(),
                src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]),
                brw_imm_f(0.0f), BRW_CONDITIONAL_L));
       emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, clip_flags_hi, brw_imm_d(0));
       emit(SHL(clip_flags_hi, src_reg(clip_flags_hi), brw_imm_d(4)));
       emit(OR(header_w, src_reg(header_w), src_reg(clip_flags_hi)));
    }

    /* i965 clipping workaround:
     * 1) Test for -ve rhw
     * 2) If set,
     *      set ndc = (0,0,0,0)
     *      set ucp[6] = 1
     *
     * Later, clipping will detect ucp[6] and ensure the primitive is
     * clipped against all fixed planes.
     */
    if (devinfo->has_negative_rhw_bug &&
        output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
       src_reg rhw = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
       rhw.swizzle = BRW_SWIZZLE_WWWW;
       emit(CMP(dst_null_f(), rhw, brw_imm_f(0.0f), BRW_CONDITIONAL_L));

       /* Both fix-ups are predicated on the comparison above. */
       vec4_instruction *set_ucp6 =
          emit(OR(header_w, src_reg(header_w), brw_imm_ud(1u << 6)));
       set_ucp6->predicate = BRW_PREDICATE_NORMAL;

       output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
       vec4_instruction *zero_ndc =
          emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
       zero_ndc->predicate = BRW_PREDICATE_NORMAL;
    }

    emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header)));
 }

 /**
  * Emits the MOV that copies one packed component group of a generic
  * varying into the URB payload register \p reg.
  *
  * \param reg        Destination payload register; its type must match the
  *                   output register's type.
  * \param varying    Varying slot, which must be below VARYING_SLOT_MAX.
  * \param component  Index of the packed component group within the slot.
  *
  * \return the emitted MOV, or NULL when the group has no components or no
  *         register is assigned to it.
  */
 vec4_instruction *
 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
 {
    assert(varying < VARYING_SLOT_MAX);

    const unsigned component_count = output_num_components[varying][component];
    if (component_count == 0)
       return NULL;

    const dst_reg &output = output_reg[varying][component];
    assert(output.type == reg.type);

    current_annotation = output_reg_annotation[varying];

    if (output.file == BAD_FILE)
       return NULL;

    src_reg source = src_reg(output);
    source.swizzle = BRW_SWZ_COMP_OUTPUT(component);
    reg.writemask =
       brw_writemask_for_component_packing(component_count, component);

    return emit(MOV(reg, source));
 }

 /**
  * Emits the instructions that fill one URB payload register, \p reg, with
  * the contents of the VUE slot identified by \p varying.
  *
  * Both \p reg and the first output register of the varying are forced to
  * float type before dispatching on the slot kind.
  */
 void
 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
 {
    output_reg[varying][0].type = BRW_REGISTER_TYPE_F;
    reg.type = BRW_REGISTER_TYPE_F;

    switch (varying) {
    case VARYING_SLOT_PSIZ:
       /* PSIZ is always in slot 0, and is coupled with other flags. */
       current_annotation = "indices, point width, clip flags";
       emit_psiz_and_flags(reg);
       break;

    case BRW_VARYING_SLOT_NDC:
    case VARYING_SLOT_POS: {
       /* Both are plain copies of the corresponding output register, and
        * are skipped when no register was assigned to it.
        */
       current_annotation =
          varying == VARYING_SLOT_POS ? "gl_Position" : "NDC";

       const dst_reg &source = output_reg[varying][0];
       if (source.file != BAD_FILE)
          emit(MOV(reg, src_reg(source)));
       break;
    }

    case VARYING_SLOT_EDGE: {
       /* This is present when doing unfilled polygons.  We're supposed to copy
        * the edge flag from the user-provided vertex array
        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
        * of that attribute (starts as 1.0f).  This is then used in clipping to
        * determine which edges should be drawn as wireframe.
        */
       current_annotation = "edge flag";
       const dst_reg edge_flag_attr(ATTR, VERT_ATTRIB_EDGEFLAG,
                                    glsl_type::float_type, WRITEMASK_XYZW);
       emit(MOV(reg, src_reg(edge_flag_attr)));
       break;
    }

    case BRW_VARYING_SLOT_PAD:
       /* No need to write to this slot */
       break;

    default:
       /* Generic varying: copy each of the four component groups. */
       for (int component = 0; component < 4; ++component)
          emit_generic_urb_slot(reg, varying, component);
       break;
    }
 }

 /**
  * Returns \p mlen padded as required for an interleaved URB write.
  *
  * Before Gen6 the length is returned unchanged.  On Gen6+ the length is
  * bumped by one whenever (mlen % 2) != 1, so that the result counts the
  * header register plus an even number of data registers.
  */
 static int
 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
 {
    if (devinfo->gen < 6)
       return mlen;

    /* URB data written (does not include the message header reg) must
     * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
     * section 5.4.3.2.2: URB_INTERLEAVED.
     *
     * URB entries are allocated on a multiple of 1024 bits, so an
     * extra 128 bits written here to make the end align to 256 is
     * no problem.
     */
    const bool needs_padding = (mlen % 2) != 1;

    return needs_padding ? mlen + 1 : mlen;
 }


 /**
  * Generates the VUE payload plus the necessary URB write instructions to
  * output it.
  *
  * The VUE layout is documented in Volume 2a.
  *
  * The payload is assembled in MRFs starting right after the message header
  * in base_mrf.  When the slots do not fit in one message, several URB
  * writes are emitted, each reusing the same MRF range and carrying the URB
  * offset of its first slot.
  */
 void
 vec4_visitor::emit_vertex()
 {
    /* MRF 0 is reserved for the debugger, so start with message header
     * in MRF 1.
     */
    int base_mrf = 1;
    int mrf = base_mrf;
    /* In the process of generating our URB write message contents, we
     * may need to unspill a register or load from an array.  Those
     * reads would use MRFs 14-15.
     */
    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

    /* The following assertion verifies that max_usable_mrf causes an
     * even-numbered amount of URB write data, which will meet gen6's
     * requirements for length alignment.
     */
    assert ((max_usable_mrf - base_mrf) % 2 == 0);

    /* First mrf is the g0-based message header containing URB handles and
     * such.
     */
    emit_urb_write_header(mrf++);

    /* Only pre-Gen6 computes the NDC output here. */
    if (devinfo->gen < 6) {
       emit_ndc_computation();
    }

    /* We may need to split this up into several URB writes, so do them in a
     * loop.
     */
    int slot = 0;
    bool complete = false;
    do {
       /* URB offset is in URB row increments, and each of our MRFs is half of
        * one of those, since we're doing interleaved writes.
        */
       int offset = slot / 2;

       /* Each message restarts its payload in the MRF right after the
        * header.
        */
       mrf = base_mrf + 1;
       for (; slot < prog_data->vue_map.num_slots; ++slot) {
          emit_urb_slot(dst_reg(MRF, mrf++),
                        prog_data->vue_map.slot_to_varying[slot]);

          /* If this was max_usable_mrf, we can't fit anything more into this
           * URB WRITE. Same thing if we reached the maximum length available.
           */
          if (mrf > max_usable_mrf ||
              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
             /* The break skips the loop's own ++slot, so advance past the
              * slot just written before leaving.
              */
             slot++;
             break;
          }
       }

       /* This is the last message once every slot has been written. */
       complete = slot >= prog_data->vue_map.num_slots;
       current_annotation = "URB write";
       vec4_instruction *inst = emit_urb_write_opcode(complete);
       inst->base_mrf = base_mrf;
       /* Message length covers the header plus the payload MRFs used, padded
        * by align_interleaved_urb_mlen().
        */
       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
       /* Added on top of whatever offset emit_urb_write_opcode() set. */
       inst->offset += offset;
    } while(!complete);
 }


 /**
  * Builds the offset operand of a scratch access.
  *
  * \param block       Block containing \p inst.
  * \param inst        Instruction before which any address arithmetic is
  *                    inserted; its destination type decides whether the
  *                    access is treated as 64-bit.
  * \param reladdr     Optional indirect index; NULL for a direct access.
  * \param reg_offset  Constant part of the offset.
  *
  * \return an immediate when \p reladdr is NULL, otherwise a new integer
  *         temporary that the inserted instructions compute.
  */
 src_reg
 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                                  src_reg *reladdr, int reg_offset)
 {
    /* Because we store the values to scratch interleaved like our
     * vertex data, we need to scale the vec4 index by 2.
     *
     * Pre-gen6, the message header uses byte offsets instead of vec4
     * (16-byte) offset units, hence the extra factor of 16 there.
     */
    const int scale = devinfo->gen < 6 ? 2 * 16 : 2;

    /* Direct access: the whole offset is known at compile time. */
    if (!reladdr)
       return brw_imm_d(reg_offset * scale);

    src_reg index = src_reg(this, glsl_type::int_type);
    const bool is_64bit = type_sz(inst->dst.type) >= 8;

    if (is_64bit) {
       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
        * to multiply the reladdr by 2. Notice that the reg_offset part
        * is in units of 16 bytes and is used to select the low/high 16-byte
        * chunk of a full dvec4, so we don't want to multiply that part.
        */
       emit_before(block, inst, MUL(dst_reg(index), *reladdr,
                                    brw_imm_d(scale * 2)));
       emit_before(block, inst, ADD(dst_reg(index), index,
                                    brw_imm_d(reg_offset * scale)));
    } else {
       /* index = (reladdr + reg_offset) * scale */
       emit_before(block, inst, ADD(dst_reg(index), *reladdr,
                                    brw_imm_d(reg_offset)));
       emit_before(block, inst, MUL(dst_reg(index), index,
                                    brw_imm_d(scale)));
    }

    return index;
 }

 /**
  * Emits, before @inst, the instruction(s) that load the value named by
  * @orig_src from scratch space at @base_offset into @temp.
  *
  * @base_offset is added to @orig_src's own offset expressed in registers
  * (REG_SIZE bytes); that offset must be a multiple of REG_SIZE.
  *
  * Sources narrower than 8 bytes per component need a single scratch read.
  * 64-bit sources are read as two consecutive registers into a dvec4
  * temporary, which shuffle_64bit_data() then converts into @temp.
  */
 void
 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                                 dst_reg temp, src_reg orig_src,
                                 int base_offset)
 {
    assert(orig_src.offset % REG_SIZE == 0);

    const int first_reg = base_offset + orig_src.offset / REG_SIZE;
    src_reg first_index = get_scratch_offset(block, inst, orig_src.reladdr,
                                             first_reg);

    if (type_sz(orig_src.type) < 8) {
       emit_before(block, inst, SCRATCH_READ(temp, first_index));
       return;
    }

    /* 64-bit path: the scratch messages work on float-typed data, so read
     * through a float view of the dvec4 temporary.
     */
    dst_reg raw = dst_reg(this, glsl_type::dvec4_type);
    dst_reg raw_as_float = retype(raw, BRW_REGISTER_TYPE_F);

    emit_before(block, inst, SCRATCH_READ(raw_as_float, first_index));

    /* The second register of the value lives one register further on. */
    src_reg second_index = get_scratch_offset(block, inst, orig_src.reladdr,
                                              first_reg + 1);
    vec4_instruction *second_read =
       SCRATCH_READ(byte_offset(raw_as_float, REG_SIZE), second_index);
    emit_before(block, inst, second_read);

    /* The shuffle is anchored on the second read, i.e. it uses the data
     * only once both registers have been loaded.
     */
    shuffle_64bit_data(temp, src_reg(raw), false, block, second_read);
 }

 /**
  * Emits an instruction after @inst to store the value to be written
  * to @orig_dst to scratch space at @base_offset, from @temp.
  *
  * @base_offset is measured in 32-byte units (the size of a register).
  */
 void
 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                                  int base_offset)
 {
    assert(inst->dst.offset % REG_SIZE == 0);
    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                       reg_offset);

    /* Create a temporary register to store *inst's result in.
     *
     * We have to be careful in MOVing from our temporary result register in
     * the scratch write.  If we swizzle from channels of the temporary that
     * weren't initialized, it will confuse live interval analysis, which will
     * make spilling fail to make progress.
     */
    bool is_64bit = type_sz(inst->dst.type) == 8;
    const glsl_type *alloc_type =
       is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
    const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
                                        inst->dst.type),
                                 brw_swizzle_for_mask(inst->dst.writemask));

    if (!is_64bit) {
       dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
 				          inst->dst.writemask));
       vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
       if (inst->opcode != BRW_OPCODE_SEL)
          write->predicate = inst->predicate;
       write->ir = inst->ir;
       write->annotation = inst->annotation;
       inst->insert_after(block, write);
    } else {
       dst_reg shuffled = dst_reg(this, alloc_type);
       vec4_instruction *last =
          shuffle_64bit_data(shuffled, temp, true, block, inst);
       src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));

       uint8_t mask = 0;
       if (inst->dst.writemask & WRITEMASK_X)
          mask |= WRITEMASK_XY;
       if (inst->dst.writemask & WRITEMASK_Y)
          mask |= WRITEMASK_ZW;
       if (mask) {
          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

          vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
          if (inst->opcode != BRW_OPCODE_SEL)
             write->predicate = inst->predicate;
          write->ir = inst->ir;
          write->annotation = inst->annotation;
          last->insert_after(block, write);
       }

       mask = 0;
       if (inst->dst.writemask & WRITEMASK_Z)
          mask |= WRITEMASK_XY;
       if (inst->dst.writemask & WRITEMASK_W)
          mask |= WRITEMASK_ZW;
       if (mask) {
          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

          src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                             reg_offset + 1);
          vec4_instruction *write =
             SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
          if (inst->opcode != BRW_OPCODE_SEL)
             write->predicate = inst->predicate;
          write->ir = inst->ir;
          write->annotation = inst->annotation;
          last->insert_after(block, write);
       }
    }

    inst->dst.file = temp.file;
    inst->dst.nr = temp.nr;
    inst->dst.offset %= REG_SIZE;
    inst->dst.reladdr = NULL;
 }

 /**
  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
  * adds the scratch read(s) before \p inst. Nested reladdr chains are
  * resolved innermost first, and each reladdr reference is rewritten to the
  * register that received its scratch load.
  *
  * \param scratch_loc  Scratch location per virtual GRF number, or -1 for
  *                     registers that are not kept in scratch.
  *
  * \return \p src if it did not require a scratch load, otherwise, the
  * register holding the result of the scratch load that the caller should
  * use to rewrite src.
  */
 src_reg
 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                    vec4_instruction *inst, src_reg src)
 {
    /* Recurse first so that the index register itself is available before
     * it is used to address src.
     */
    if (src.reladdr) {
       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                           *src.reladdr);
    }

    const bool lives_in_scratch =
       src.file == VGRF && scratch_loc[src.nr] != -1;
    if (!lives_in_scratch)
       return src;

    /* Load into a temporary wide enough for the source's component size. */
    const glsl_type *temp_type = type_sz(src.type) == 8 ?
       glsl_type::dvec4_type : glsl_type::vec4_type;
    dst_reg temp = dst_reg(this, temp_type);
    emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);

    /* Point src at the loaded copy; the indirection has been consumed. */
    src.nr = temp.nr;
    src.offset %= REG_SIZE;
    src.reladdr = NULL;

    return src;
 }

 /**
  * We can't generally support array access in GRF space, because a
  * single instruction's destination can only span 2 contiguous
  * registers.  So, we send all GRF arrays that get variable index
  * access to scratch space.
  *
  * The pass runs in two walks over the CFG: the first assigns a scratch
  * location to every virtual GRF accessed through a reladdr, the second
  * rewrites every access to such a register into scratch reads/writes.
  */
 void
 vec4_visitor::move_grf_array_access_to_scratch()
 {
    /* Scratch location per virtual GRF; -1 means "stays in a register".
     * memset() with -1 sets every byte to 0xff, which yields -1 in each int.
     */
    int scratch_loc[this->alloc.count];
    memset(scratch_loc, -1, sizeof(scratch_loc));

    /* First, calculate the set of virtual GRFs that need to be punted
     * to scratch due to having any array access on them, and where in
     * scratch.
     */
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       if (inst->dst.file == VGRF && inst->dst.reladdr) {
          if (scratch_loc[inst->dst.nr] == -1) {
             scratch_loc[inst->dst.nr] = last_scratch;
             last_scratch += this->alloc.sizes[inst->dst.nr];
          }

          /* Walk the destination's reladdr chain: every register in it that
           * is itself indirectly addressed goes to scratch too.
           */
          for (src_reg *iter = inst->dst.reladdr;
               iter->reladdr;
               iter = iter->reladdr) {
             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
                scratch_loc[iter->nr] = last_scratch;
                last_scratch += this->alloc.sizes[iter->nr];
             }
          }
       }

       /* Same for each source, starting at the source itself. */
       for (int i = 0 ; i < 3; i++) {
          for (src_reg *iter = &inst->src[i];
               iter->reladdr;
               iter = iter->reladdr) {
             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
                scratch_loc[iter->nr] = last_scratch;
                last_scratch += this->alloc.sizes[iter->nr];
             }
          }
       }
    }

    /* Now, for anything that will be accessed through scratch, rewrite
     * it to load/store.  Note that this is a _safe list walk, because
     * we may generate a new scratch_write instruction after the one
     * we're processing.
     */
    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
       /* Set up the annotation tracking for new generated instructions. */
       base_ir = inst->ir;
       current_annotation = inst->annotation;

       /* First handle scratch access on the dst. Notice we have to handle
        * the case where the dst's reladdr also points to scratch space.
        */
       if (inst->dst.reladdr)
          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                    *inst->dst.reladdr);

       /* Now that we have handled any (possibly recursive) reladdr scratch
        * accesses for dst we can safely do the scratch write for dst itself
        */
       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

       /* Now handle scratch access on any src. In this case, since inst->src[i]
        * already is a src_reg, we can just call emit_resolve_reladdr with
        * inst->src[i] and it will take care of handling scratch loads for
        * both src and src.reladdr (recursively).
        */
       for (int i = 0 ; i < 3; i++) {
          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                              inst->src[i]);
       }
    }
 }

 /**
  * Emits, before @inst, the instruction(s) that load the value named by
  * @orig_src from the pull constant buffer (surface) at @base_offset into
  * @temp.
  *
  * @base_offset is counted in 16-byte units and is added to @orig_src's own
  * offset, which must be a multiple of 16 bytes.  When @indirect names a
  * register (its file is not BAD_FILE), its value is added to the resulting
  * byte offset at run time.
  *
  * The pull constant surface is also marked as used in the program data.
  */
 void
 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
                                       dst_reg temp, src_reg orig_src,
                                       int base_offset, src_reg indirect)
 {
    assert(orig_src.offset % 16 == 0);

    const unsigned surface = prog_data->base.binding_table.pull_constants_start;
    const int const_offset_bytes = (base_offset + orig_src.offset / 16) * 16;
    const bool has_indirect = indirect.file != BAD_FILE;

    src_reg offset;
    if (has_indirect || devinfo->gen >= 8) {
       /* The offset has to live in a register: either it is only known at
        * run time, or (Gen8+) we store it in a GRF so we can send-from-GRF.
        */
       offset = src_reg(this, glsl_type::uint_type);

       if (has_indirect) {
          emit_before(block, inst, ADD(dst_reg(offset), indirect,
                                       brw_imm_ud(const_offset_bytes)));
       } else {
          emit_before(block, inst, MOV(dst_reg(offset),
                                       brw_imm_ud(const_offset_bytes)));
       }
    } else {
       offset = brw_imm_d(const_offset_bytes);
    }

    emit_pull_constant_load_reg(temp, brw_imm_ud(surface), offset,
                                block, inst);

    brw_mark_surface_used(&prog_data->base, surface);
 }

 /**
  * Implements array access of uniforms by inserting a
  * PULL_CONSTANT_LOAD instruction.
  *
  * Unlike temporary GRF array access (where we don't support it due to
  * the difficulty of doing relative addressing on instruction
  * destinations), we could potentially do array access of uniforms
  * that were loaded in GRF space as push constants.  In real-world
  * usage we've seen, though, the arrays being used are always larger
  * than we could load as push constants, so just always move all
  * uniform array access out to a pull constant buffer.
  *
  * Every path through this function ends by calling
  * split_uniform_registers().
  */
 void
 vec4_visitor::move_uniform_array_access_to_pull_constants()
 {
    /* The vulkan driver doesn't support pull constants other than UBOs so
     * everything has to be pushed regardless.
     */
    if (stage_prog_data->pull_param == NULL) {
       split_uniform_registers();
       return;
    }

    /* One entry per vec4 uniform; -1 means the uniform is not pulled.
     * memset() with -1 sets every byte to 0xff, which yields -1 in each int.
     */
    int pull_constant_loc[this->uniforms];
    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));

    /* First, walk through the instructions and determine which things need to
     * be pulled.  We mark something as needing to be pulled by setting
     * pull_constant_loc to 0.
     */
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       /* We only care about MOV_INDIRECT of a uniform */
       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
           inst->src[0].file != UNIFORM)
          continue;

       /* Index of the first vec4 uniform the instruction addresses. */
       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

       /* NOTE(review): src[2] presumably holds the size in bytes of the
        * indirectly addressed range, rounded up here to whole vec4s --
        * confirm against the MOV_INDIRECT definition.
        */
       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
          pull_constant_loc[uniform_nr + j] = 0;
    }

    /* Next, we walk the list of uniforms and assign real pull constant
     * locations and set their corresponding entries in pull_param.
     */
    for (int j = 0; j < this->uniforms; j++) {
       if (pull_constant_loc[j] < 0)
          continue;

       /* nr_pull_params counts individual components, four of which are
        * appended per uniform below, so the vec4 location is a quarter of it.
        */
       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;

       for (int i = 0; i < 4; i++) {
          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
             = stage_prog_data->param[j * 4 + i];
       }
    }

    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
     * instructions to actual uniform pulls.
     */
    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
       /* We only care about MOV_INDIRECT of a uniform */
       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
           inst->src[0].file != UNIFORM)
          continue;

       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);

       /* The load writes straight into the MOV_INDIRECT's destination, with
        * src[1] supplying the run-time offset, so the original instruction
        * can be dropped afterwards.
        */
       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
                               pull_constant_loc[uniform_nr], inst->src[1]);
       inst->remove(block);
    }

    /* Now there are no accesses of the UNIFORM file with a reladdr, so
     * no need to track them as larger-than-vec4 objects.  This will be
     * relied on in cutting out unused uniform vectors from push
     * constants.
     */
    split_uniform_registers();
 }

 /**
  * Replaces a negated unsigned-dword source with a temporary holding its
  * value.
  *
  * When \p reg is of type UD and has its negate flag set, a MOV of \p reg
  * into a new uvec4 temporary is emitted and \p reg is overwritten with that
  * temporary.  Any other source is left untouched.
  */
 void
 vec4_visitor::resolve_ud_negate(src_reg *reg)
 {
    const bool is_negated_ud =
       reg->type == BRW_REGISTER_TYPE_UD && reg->negate;

    if (is_negated_ud) {
       src_reg resolved = src_reg(this, glsl_type::uvec4_type);
       emit(BRW_OPCODE_MOV, dst_reg(resolved), *reg);
       *reg = resolved;
    }
 }

 /**
  * Constructs a vec4 visitor for one shader compile.
  *
  * The constructor only records its arguments and resets the bookkeeping
  * state (failure flag, annotations, output component counts, liveness
  * pointers, uniform count); no instructions are emitted here.
  */
 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                            void *log_data,
                            const struct brw_sampler_prog_key_data *key_tex,
                            struct brw_vue_prog_data *prog_data,
                            const nir_shader *shader,
                            void *mem_ctx,
                            bool no_spills,
                            int shader_time_index)
    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
      key_tex(key_tex),
      prog_data(prog_data),
      fail_msg(NULL),
      first_non_payload_grf(0),
      need_all_constants_in_pull_buffer(false),
      no_spills(no_spills),
      shader_time_index(shader_time_index),
      last_scratch(0)
 {
    /* Compile status. */
    failed = false;
    uniforms = 0;

    /* Annotation tracking for emitted instructions. */
    base_ir = NULL;
    current_annotation = NULL;

    /* Per-output tables start out empty. */
    memset(output_reg_annotation, 0, sizeof(output_reg_annotation));
    memset(output_num_components, 0, sizeof(output_num_components));

    /* No liveness information has been computed yet. */
    virtual_grf_start = NULL;
    virtual_grf_end = NULL;
    live_intervals = NULL;

    /* Gen7+ limits the register count to GEN7_MRF_HACK_START. */
    if (devinfo->gen >= 7)
       max_grf = GEN7_MRF_HACK_START;
    else
       max_grf = BRW_MAX_GRF;
 }

 /**
  * Destructor; intentionally empty, the visitor releases nothing explicitly.
  */
 vec4_visitor::~vec4_visitor()
 {
 }


 /**
  * Records a compile failure with a printf-style message.
  *
  * Only the first failure is kept: once the visitor is marked as failed,
  * later calls return without doing anything.  The stored message is
  * prefixed with the stage abbreviation and allocated from mem_ctx; it is
  * also printed to stderr when debug_enabled is set.
  *
  * \param format  printf-style format string, followed by its arguments.
  */
 void
 vec4_visitor::fail(const char *format, ...)
 {
    if (failed)
       return;

    failed = true;

    /* Expand the caller's format first... */
    va_list args;
    va_start(args, format);
    char *reason = ralloc_vasprintf(mem_ctx, format, args);
    va_end(args);

    /* ...then wrap it in the standard failure banner. */
    char *full_message = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n",
                                         stage_abbrev, reason);
    this->fail_msg = full_message;

    if (debug_enabled)
       fprintf(stderr, "%s", full_message);
 }

 } /* namespace brw */