src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp - platform/external/mesa3d - Git at Google

 /*
  * Copyright © 2014 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
  * This code is based on original work by Ilia Mirkin.
  */

 /**
  * \file gen6_gs_visitor.cpp
  *
  * Gen6 geometry shader implementation
  */

 #include "gen6_gs_visitor.h"
 #include "brw_eu.h"

 namespace brw {

 /**
  * Emit the Gen6 geometry shader prolog.
  *
  * Runs the common vec4_gs_visitor prolog and then sets up the state this
  * visitor relies on to buffer vertices until thread end: the vertex_output
  * array and its running offset, the message header in MRF 1, the writeback
  * temporary, the first_vertex flag and the primitive counter. When the
  * program has transform feedback varyings the SOL registers are created and
  * xfb_setup() is called; when PrimitiveID is required it is placed in r1.
  */
 void
 gen6_gs_visitor::emit_prolog()
 {
    vec4_gs_visitor::emit_prolog();

    /* Gen6 geometry shaders require to allocate an initial VUE handle via
     * FF_SYNC message, however the documentation remarks that only one thread
     * can write to the URB simultaneously and the FF_SYNC message provides the
     * synchronization mechanism for this, so using this message effectively
     * stalls the thread until it is its turn to write to the URB. Because of
     * this, the best way to implement geometry shader algorithms in gen6 is to
     * execute the algorithm before the FF_SYNC message to maximize parallelism.
     *
     * To achieve this we buffer the geometry shader outputs for each emitted
     * vertex in vertex_output during operation. Then, when we have processed
     * the last vertex (that is, at thread end time), we send the FF_SYNC
     * message to allocate the initial VUE handle and write all buffered vertex
     * data to the URB in one go.
     *
     * For each emitted vertex, vertex_output will hold vue_map.num_slots
     * data items plus one additional item to hold required flags
     * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
     * which come right after the data items for that vertex. Vertex data and
     * flags for the next vertex come right after the data items and flags for
     * the previous vertex.
     */
    this->current_annotation = "gen6 prolog";
    /* One entry per VUE slot plus one flags entry, for every vertex the
     * shader may emit.
     */
    this->vertex_output = src_reg(this,
                                  glsl_type::uint_type,
                                  (prog_data->vue_map.num_slots + 1) *
                                  nir->info->gs.vertices_out);
    /* Running index into vertex_output; starts at the first entry. */
    this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
    emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));

    /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
     * so initialize it once to R0.
     */
    vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
                                      retype(brw_vec8_grf(0, 0),
                                             BRW_REGISTER_TYPE_UD)));
    inst->force_writemask_all = true;

    /* This will be used as a temporary to store writeback data of FF_SYNC
     * and URB_WRITE messages.
     */
    this->temp = src_reg(this, glsl_type::uint_type);

    /* This will be used to know when we are processing the first vertex of
     * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
     * that we are processing the first vertex in the primitive and to zero
     * otherwise. This way we can use its value directly in the URB write
     * headers.
     */
    this->first_vertex = src_reg(this, glsl_type::uint_type);
    emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));

    /* The FF_SYNC message requires to know the number of primitives generated,
     * so keep a counter for this.
     */
    this->prim_count = src_reg(this, glsl_type::uint_type);
    emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));

    if (prog->info.has_transform_feedback_varyings) {
       /* Create a virtual register to hold destination indices in SOL */
       this->destination_indices = src_reg(this, glsl_type::uvec4_type);
       /* Create a virtual register to hold number of written primitives */
       this->sol_prim_written = src_reg(this, glsl_type::uint_type);
       /* Create a virtual register to hold Streamed Vertex Buffer Indices */
       this->svbi = src_reg(this, glsl_type::uvec4_type);
       /* Create a virtual register to hold max values of SVBI. The value is
        * copied out of r1.4 here, before r1 may be overwritten with the
        * PrimitiveID below.
        */
       this->max_svbi = src_reg(this, glsl_type::uvec4_type);
       emit(MOV(dst_reg(this->max_svbi),
                src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));

       xfb_setup();
    }

    /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
     * needs it we have to move it to a separate register where we can map
     * the attribute.
     *
     * Notice that we cannot use a virtual register for this, because we need to
     * map all input attributes to hardware registers in setup_payload(),
     * which happens before virtual registers are mapped to hardware registers.
     * We could work around that issue if we were able to compute the first
     * non-payload register here and move the PrimitiveID information to that
     * register, but we can't because at this point we don't know the final
     * number of uniforms that will be included in the payload.
     *
     * So, what we do is to place PrimitiveID information in r1, which is always
     * delivered as part of the payload, but it is only populated with data
     * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
     * in the 3DSTATE_GS state packet. That information can be obtained by other
     * means though, so we can safely use r1 for this purpose.
     */
    if (gs_prog_data->include_primitive_id) {
       this->primitive_id =
          src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
       emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
    }
 }

 void
 gen6_gs_visitor::gs_emit_vertex(int stream_id)
 {
    this->current_annotation = "gen6 emit vertex";

    /* Buffer all output slots for this vertex in vertex_output */
    for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
       int varying = prog_data->vue_map.slot_to_varying[slot];
       if (varying != VARYING_SLOT_PSIZ) {
          dst_reg dst(this->vertex_output);
          dst.reladdr = ralloc(mem_ctx, src_reg);
          memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
          emit_urb_slot(dst, varying);
       } else {
          /* The PSIZ slot can pack multiple varyings in different channels
           * and emit_urb_slot() will produce a MOV instruction for each of
           * them. Since we are writing to an array, that will translate to
           * possibly multiple MOV instructions with an array destination and
           * each will generate a scratch write with the same offset into
           * scratch space (thus, each one overwriting the previous). This is
           * not what we want. What we will do instead is emit PSIZ to a
           * a regular temporary register, then move that resgister into the
           * array. This way we only have one instruction with an array
           * destination and we only produce a single scratch write.
           */
          dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
          emit_urb_slot(tmp, varying);
          dst_reg dst(this->vertex_output);
          dst.reladdr = ralloc(mem_ctx, src_reg);
          memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
          vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
          inst->force_writemask_all = true;
       }

       emit(ADD(dst_reg(this->vertex_output_offset),
                this->vertex_output_offset, brw_imm_ud(1u)));
    }

    /* Now buffer flags for this vertex */
    dst_reg dst(this->vertex_output);
    dst.reladdr = ralloc(mem_ctx, src_reg);
    memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
    if (nir->info->gs.output_primitive == GL_POINTS) {
       /* If we are outputting points, then every vertex has PrimStart and
        * PrimEnd set.
        */
       emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
                               URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
       emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
    } else {
       /* Otherwise, we can only set the PrimStart flag, which we have stored
        * in the first_vertex register. We will have to wait until we execute
        * EndPrimitive() or we end the thread to set the PrimEnd flag on a
        * vertex.
        */
       emit(OR(dst, this->first_vertex,
               brw_imm_ud(gs_prog_data->output_topology <<
                          URB_WRITE_PRIM_TYPE_SHIFT)));
       emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
    }
    emit(ADD(dst_reg(this->vertex_output_offset),
             this->vertex_output_offset, brw_imm_ud(1u)));
 }

 void
 gen6_gs_visitor::gs_end_primitive()
 {
    this->current_annotation = "gen6 end primitive";
    /* Calling EndPrimitive() is optional for point output. In this case we set
     * the PrimEnd flag when we process EmitVertex().
     */
    if (nir->info->gs.output_primitive == GL_POINTS)
       return;

    /* Otherwise we know that the last vertex we have processed was the last
     * vertex in the primitive and we need to set its PrimEnd flag, so do this
     * unless we haven't emitted that vertex at all (vertex_count != 0).
     *
     * Notice that we have already incremented vertex_count when we processed
     * the last emit_vertex, so we need to take that into account in the
     * comparison below (hence the num_output_vertices + 1 in the comparison
     * below).
     */
    unsigned num_output_vertices = nir->info->gs.vertices_out;
    emit(CMP(dst_null_ud(), this->vertex_count,
             brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
    vec4_instruction *inst = emit(CMP(dst_null_ud(),
                                      this->vertex_count, brw_imm_ud(0u),
                                      BRW_CONDITIONAL_NEQ));
    inst->predicate = BRW_PREDICATE_NORMAL;
    emit(IF(BRW_PREDICATE_NORMAL));
    {
       /* vertex_output_offset is already pointing at the first entry of the
        * next vertex. So subtract 1 to modify the flags for the previous
        * vertex.
        */
       src_reg offset(this, glsl_type::uint_type);
       emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));

       src_reg dst(this->vertex_output);
       dst.reladdr = ralloc(mem_ctx, src_reg);
       memcpy(dst.reladdr, &offset, sizeof(src_reg));

       emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
       emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));

       /* Set the first vertex flag to indicate that the next vertex will start
        * a primitive.
        */
       emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
    }
    emit(BRW_OPCODE_ENDIF);
 }

 void
 gen6_gs_visitor::emit_urb_write_header(int mrf)
 {
    this->current_annotation = "gen6 urb header";
    /* Compute offset of the flags for the current vertex in vertex_output and
     * write them in dw2 of the message header.
     *
     * Notice that by the time that emit_thread_end() calls here
     * vertex_output_offset should point to the first data item of the current
     * vertex in vertex_output, thus we only need to add the number of output
     * slots per vertex to that offset to obtain the flags data offset.
     */
    src_reg flags_offset(this, glsl_type::uint_type);
    emit(ADD(dst_reg(flags_offset),
             this->vertex_output_offset,
             brw_imm_d(prog_data->vue_map.num_slots)));

    src_reg flags_data(this->vertex_output);
    flags_data.reladdr = ralloc(mem_ctx, src_reg);
    memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));

    emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
 }

 /**
  * Round a URB write message length up to the next odd value.
  *
  * The URB data written (which does not include the message header register)
  * must be a multiple of 256 bits, or 2 VS registers; see vol5c.5, section
  * 5.4.3.2.2: URB_INTERLEAVED. Since \p mlen counts the header register as
  * well, that makes the valid lengths the odd ones.
  *
  * \param mlen  message length in registers, header included.
  * \return \p mlen if it is already odd, \p mlen + 1 otherwise.
  */
 static int
 align_interleaved_urb_mlen(int mlen)
 {
    const bool already_aligned = (mlen % 2) == 1;
    return already_aligned ? mlen : mlen + 1;
 }

 void
 gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
                                        int last_mrf, int urb_offset)
 {
    vec4_instruction *inst = NULL;

    if (!complete) {
       /* If the vertex is not complete we don't have to do anything special */
       inst = emit(GS_OPCODE_URB_WRITE);
       inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
    } else {
       /* Otherwise we always request to allocate a new VUE handle. If this is
        * the last write before the EOT message and the new handle never gets
        * used it will be dereferenced when we send the EOT message. This is
        * necessary to avoid different setups for the EOT message (one for the
        * case when there is no output and another for the case when there is)
        * which would require to end the program with an IF/ELSE/ENDIF block,
        * something we do not want.
        */
       inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
       inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
       inst->dst = dst_reg(MRF, base_mrf);
       inst->src[0] = this->temp;
    }

    inst->base_mrf = base_mrf;
    inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
    inst->offset = urb_offset;
 }

 /**
  * Emit the code that runs when the geometry shader thread ends.
  *
  * Closes a still-open primitive (non-point output only), then, if at least
  * one vertex was emitted, sends FF_SYNC and loops over the vertices buffered
  * in vertex_output writing them to the URB, followed by the transform
  * feedback writes when the program has transform feedback varyings. Finally
  * the EOT message is emitted unconditionally.
  */
 void
 gen6_gs_visitor::emit_thread_end()
 {
    /* Make sure the current primitive is ended: we know it is not ended when
     * first_vertex is zero, since gs_emit_vertex() clears it after buffering
     * a vertex and gs_end_primitive() sets it back to URB_WRITE_PRIM_START.
     * This is only relevant for outputs other than points because in the
     * point case we set PrimEnd on all vertices.
     */
    if (nir->info->gs.output_primitive != GL_POINTS) {
       emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
       emit(IF(BRW_PREDICATE_NORMAL));
       gs_end_primitive();
       emit(BRW_OPCODE_ENDIF);
    }

    /* Here we have to:
     * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
     * 2) Loop over all buffered vertex data and write it to corresponding
     *    URB entries.
     * 3) Allocate new VUE handles for all vertices other than the first.
     * 4) Send a final EOT message.
     */

    /* MRF 0 is reserved for the debugger, so start with message header
     * in MRF 1.
     */
    int base_mrf = 1;

    /* In the process of generating our URB write message contents, we
     * may need to unspill a register or load from an array.  Those
     * reads would use MRFs 21..23
     */
    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

    /* Issue the FF_SYNC message and obtain the initial VUE handle. All of
     * this is skipped at run time when no vertex was emitted.
     */
    emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
    emit(IF(BRW_PREDICATE_NORMAL));
    {
       this->current_annotation = "gen6 thread end: ff_sync";

       vec4_instruction *inst;
       if (prog->info.has_transform_feedback_varyings) {
          /* With transform feedback, FF_SYNC also takes the svbi register,
           * which is written by GS_OPCODE_FF_SYNC_SET_PRIMITIVES first.
           */
          src_reg sol_temp(this, glsl_type::uvec4_type);
          emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
               dst_reg(this->svbi),
               this->vertex_count,
               this->prim_count,
               sol_temp);
          inst = emit(GS_OPCODE_FF_SYNC,
                      dst_reg(this->temp), this->prim_count, this->svbi);
       } else {
          inst = emit(GS_OPCODE_FF_SYNC,
                      dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
       }
       inst->base_mrf = base_mrf;

       /* Loop over all buffered vertices and emit URB write messages */
       this->current_annotation = "gen6 thread end: urb writes init";
       src_reg vertex(this, glsl_type::uint_type);
       emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
       /* Rewind to the first buffered entry. */
       emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));

       this->current_annotation = "gen6 thread end: urb writes";
       emit(BRW_OPCODE_DO);
       {
          /* Leave the emitted loop once every buffered vertex was written. */
          emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
          inst = emit(BRW_OPCODE_BREAK);
          inst->predicate = BRW_PREDICATE_NORMAL;

          /* First we prepare the message header */
          emit_urb_write_header(base_mrf);

          /* Then add vertex data to the message in interleaved fashion.
           *
           * Each pass of the do/while below emits one URB write message; a
           * vertex whose slots do not fit in a single message is split over
           * several of them.
           */
          int slot = 0;
          bool complete = false;
          do {
             int mrf = base_mrf + 1;

             /* URB offset is in URB row increments, and each of our MRFs is half
              * of one of those, since we're doing interleaved writes.
              */
             int urb_offset = slot / 2;

             for (; slot < prog_data->vue_map.num_slots; ++slot) {
                int varying = prog_data->vue_map.slot_to_varying[slot];
                current_annotation = output_reg_annotation[varying];

                /* Compute offset of this slot for the current vertex
                 * in vertex_output
                 */
                src_reg data(this->vertex_output);
                data.reladdr = ralloc(mem_ctx, src_reg);
                memcpy(data.reladdr, &this->vertex_output_offset,
                       sizeof(src_reg));

                /* Copy this slot to the appropriate message register */
                dst_reg reg = dst_reg(MRF, mrf);
                reg.type = output_reg[varying][0].type;
                data.type = reg.type;
                vec4_instruction *inst = emit(MOV(reg, data));
                inst->force_writemask_all = true;

                mrf++;
                emit(ADD(dst_reg(this->vertex_output_offset),
                         this->vertex_output_offset, brw_imm_ud(1u)));

                /* If this was max_usable_mrf, we can't fit anything more into
                 * this URB WRITE. Same if we reached the max. message length.
                 * The slot++ accounts for the slot just written, because the
                 * break skips the loop's own increment.
                 */
                if (mrf > max_usable_mrf ||
                    align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
                   slot++;
                   break;
                }
             }

             complete = slot >= prog_data->vue_map.num_slots;
             emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
          } while (!complete);

          /* Skip over the flags data item so that vertex_output_offset points
           * to the first data item of the next vertex, so that we can start
           * writing the next vertex.
           */
          emit(ADD(dst_reg(this->vertex_output_offset),
                   this->vertex_output_offset, brw_imm_ud(1u)));

          emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
       }
       emit(BRW_OPCODE_WHILE);

       if (prog->info.has_transform_feedback_varyings)
          xfb_write();
    }
    emit(BRW_OPCODE_ENDIF);

    /* Finally, emit EOT message.
     *
     * In gen6 we need to end the thread differently depending on whether we have
     * emitted at least one vertex or not. In case we did, the EOT message must
     * always include the COMPLETE flag or else the GPU hangs. If we have not
     * produced any output we can't use the COMPLETE flag.
     *
     * However, this would lead us to end the program with an ENDIF opcode,
     * which we want to avoid, so what we do is that we always request a new
     * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
     * With this we make sure that whether we have emitted at least one vertex
     * or none at all, we have to finish the thread without writing to the URB,
     * which works for both cases by setting the COMPLETE and UNUSED flags in
     * the EOT message.
     */
    this->current_annotation = "gen6 thread end: EOT";

    if (prog->info.has_transform_feedback_varyings) {
       /* When emitting EOT, set SONumPrimsWritten Increment Value: the low
        * 16 bits of sol_prim_written, shifted into the upper half of dw2 of
        * the message header.
        */
       src_reg data(this, glsl_type::uint_type);
       emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
       emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
       emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
    }

    /* The EOT message consists of the header register only. */
    vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
    inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
    inst->base_mrf = base_mrf;
    inst->mlen = 1;
 }

 /**
  * Assign payload registers and lower input attributes to hardware registers.
  *
  * The payload is laid out as r0, r1, the uniforms and then the varying
  * inputs. The first register past the payload is recorded in
  * first_non_payload_grf.
  */
 void
 gen6_gs_visitor::setup_payload()
 {
    /* Reading an input the vertex shader never wrote gives undefined results
     * but must not crash anything. Starting with an all-zero map makes such
     * reads come from r0.
     */
    int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES] = { 0 };

    /* Attributes are interleaved: each register holds two attribute slots. */
    int attributes_per_reg = 2;

    /* r0 always carries important payload data, so r1 is the next one. */
    int reg = 1;

    /* r1 is always part of the payload. It holds transform feedback related
     * information when the GEN6_GS_SVBI_PAYLOAD_ENABLE bit is set in the
     * 3DSTATE_GS packet, and it is overwritten with the PrimitiveID
     * information (after moving the original value to a virtual register if
     * necessary).
     */
    if (gs_prog_data->include_primitive_id)
       attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;

    /* Uniforms start right after r1, followed by the varying inputs. */
    reg = setup_uniforms(reg + 1);
    reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);

    lower_attributes_to_hw_regs(attribute_map, true);

    this->first_non_payload_grf = reg;
 }

 /**
  * Record the transform feedback bindings of the linked program.
  *
  * For every linked transform feedback output, stores its output register
  * and the swizzle matching its component offset in gs_prog_data.
  */
 void
 gen6_gs_visitor::xfb_setup()
 {
    /* Swizzle selecting the components that start at a given offset. */
    static const unsigned swizzle_for_offset[4] = {
       BRW_SWIZZLE4(0, 1, 2, 3),
       BRW_SWIZZLE4(1, 2, 3, 3),
       BRW_SWIZZLE4(2, 3, 3, 3),
       BRW_SWIZZLE4(3, 3, 3, 3)
    };

    const struct gl_transform_feedback_info *linked_xfb_info =
       this->prog->sh.LinkedTransformFeedback;

    /* VUE slot numbers must fit in the unsigned chars of
     * prog_data->transform_feedback_bindings[].
     */
    STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);

    /* We must not need more binding table entries than were reserved for
     * transform feedback. (That should hold, because enough entries are
     * reserved to have one per component.)
     */
    assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);

    gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;

    int binding = 0;
    while (binding < gs_prog_data->num_transform_feedback_bindings) {
       gs_prog_data->transform_feedback_bindings[binding] =
          linked_xfb_info->Outputs[binding].OutputRegister;
       gs_prog_data->transform_feedback_swizzles[binding] =
          swizzle_for_offset[linked_xfb_info->Outputs[binding].ComponentOffset];
       binding++;
    }
 }

 void
 gen6_gs_visitor::xfb_write()
 {
    unsigned num_verts;

    if (!gs_prog_data->num_transform_feedback_bindings)
       return;

    switch (gs_prog_data->output_topology) {
    case _3DPRIM_POINTLIST:
       num_verts = 1;
       break;
    case _3DPRIM_LINELIST:
    case _3DPRIM_LINESTRIP:
    case _3DPRIM_LINELOOP:
       num_verts = 2;
       break;
    case _3DPRIM_TRILIST:
    case _3DPRIM_TRIFAN:
    case _3DPRIM_TRISTRIP:
    case _3DPRIM_RECTLIST:
       num_verts = 3;
       break;
    case _3DPRIM_QUADLIST:
    case _3DPRIM_QUADSTRIP:
    case _3DPRIM_POLYGON:
       num_verts = 3;
       break;
    default:
       unreachable("Unexpected primitive type in Gen6 SOL program.");
    }

    this->current_annotation = "gen6 thread end: svb writes init";

    emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
    emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));

    /* Check that at least one primitive can be written
     *
     * Note: since we use the binding table to keep track of buffer offsets
     * and stride, the GS doesn't need to keep track of a separate pointer
     * into each buffer; it uses a single pointer which increments by 1 for
     * each vertex.  So we use SVBI0 for this pointer, regardless of whether
     * transform feedback is in interleaved or separate attribs mode.
     */
    src_reg sol_temp(this, glsl_type::uvec4_type);
    emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));

    /* Compare SVBI calculated number with the maximum value, which is
     * in R1.4 (previously saved in this->max_svbi) for gen6.
     */
    emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
    emit(IF(BRW_PREDICATE_NORMAL));
    {
       vec4_instruction *inst = emit(MOV(dst_reg(destination_indices),
                                         brw_imm_vf4(brw_float_to_vf(0.0),
                                                     brw_float_to_vf(1.0),
                                                     brw_float_to_vf(2.0),
                                                     brw_float_to_vf(0.0))));
       inst->force_writemask_all = true;

       emit(ADD(dst_reg(this->destination_indices),
                this->destination_indices,
                this->svbi));
    }
    emit(BRW_OPCODE_ENDIF);

    /* Write transform feedback data for all processed vertices. */
    for (int i = 0; i < (int)nir->info->gs.vertices_out; i++) {
       emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
       emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
                BRW_CONDITIONAL_L));
       emit(IF(BRW_PREDICATE_NORMAL));
       {
          xfb_program(i, num_verts);
       }
       emit(BRW_OPCODE_ENDIF);
    }
 }

 void
 gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
 {
    unsigned binding;
    unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
    src_reg sol_temp(this, glsl_type::uvec4_type);

    /* Check for buffer overflow: we need room to write the complete primitive
     * (all vertices). Otherwise, avoid writing any vertices for it
     */
    emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
    emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
    emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
    emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
    emit(IF(BRW_PREDICATE_NORMAL));
    {
       /* Avoid overwriting MRF 1 as it is used as URB write message header */
       dst_reg mrf_reg(MRF, 2);

       this->current_annotation = "gen6: emit SOL vertex data";
       /* For each vertex, generate code to output each varying using the
        * appropriate binding table entry.
        */
       for (binding = 0; binding < num_bindings; ++binding) {
          unsigned char varying =
             gs_prog_data->transform_feedback_bindings[binding];

          /* Set up the correct destination index for this vertex */
          vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
                                        mrf_reg,
                                        this->destination_indices);
          inst->sol_vertex = vertex % num_verts;

          /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
           *
           *   "Prior to End of Thread with a URB_WRITE, the kernel must
           *   ensure that all writes are complete by sending the final
           *   write as a committed write."
           */
          bool final_write = binding == (unsigned) num_bindings - 1 &&
                             inst->sol_vertex == num_verts - 1;

          /* Compute offset of this varying for the current vertex
           * in vertex_output
           */
          this->current_annotation = output_reg_annotation[varying];
          src_reg data(this->vertex_output);
          data.reladdr = ralloc(mem_ctx, src_reg);
          int offset = get_vertex_output_offset_for_varying(vertex, varying);
          emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
          memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
          data.type = output_reg[varying][0].type;

          /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
           * same slot, so make sure we write the appropriate channel
           */
          if (varying == VARYING_SLOT_PSIZ)
             data.swizzle = BRW_SWIZZLE_WWWW;
          else if (varying == VARYING_SLOT_LAYER)
             data.swizzle = BRW_SWIZZLE_YYYY;
          else if (varying == VARYING_SLOT_VIEWPORT)
             data.swizzle = BRW_SWIZZLE_ZZZZ;
          else
             data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];

          /* Write data */
          inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
          inst->sol_binding = binding;
          inst->sol_final_write = final_write;

          if (final_write) {
             /* This is the last vertex of the primitive, then increment
              * SO num primitive counter and destination indices.
              */
             emit(ADD(dst_reg(this->destination_indices),
                      this->destination_indices,
                      brw_imm_ud(num_verts)));
             emit(ADD(dst_reg(this->sol_prim_written),
                      this->sol_prim_written, brw_imm_ud(1u)));
          }

       }
       this->current_annotation = NULL;
    }
    emit(BRW_OPCODE_ENDIF);
 }

 /**
  * Compute the index into vertex_output of a varying of a buffered vertex.
  *
  * \param vertex   index of the buffered vertex.
  * \param varying  varying slot to look up.
  * \return the index of the entry; always based on a non-negative slot.
  */
 int
 gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
 {
    /* VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same
     * slot as VARYING_SLOT_PSIZ, so look that slot up for them.
     */
    const bool packed_with_psiz = varying == VARYING_SLOT_LAYER ||
                                  varying == VARYING_SLOT_VIEWPORT;
    const int lookup = packed_with_psiz ? (int) VARYING_SLOT_PSIZ : varying;

    int slot = prog_data->vue_map.varying_to_slot[lookup];

    /* A negative slot means the varying is not part of the VUE: nothing
     * writes it and its value is undefined. Any offset inside vertex_output
     * is then as good as another, so fall back to slot 0 to keep the access
     * within bounds.
     */
    if (slot < 0)
       slot = 0;

    /* Each vertex takes num_slots data entries plus one flags entry. */
    return vertex * (prog_data->vue_map.num_slots + 1) + slot;
 }

 } /* namespace brw */