gallium: Add a nir-to-TGSI pass.

The goal is to replace glsl_to_tgsi.cpp and its supporting code (~10k
LOC).  This code ends up being smaller because NIR has many lowering
passes that help it map better to TGSI than GLSL IR does.

As a benefit, this brings NIR optimizations to TGSI-only drivers.
Many of the softpipe shaders I've looked at end up being significantly
shorter.  Some changes that may be relevant to TGSI consumers:

- All immediates are now UINT typed.  This makes them less legible in
  printouts, but they deduplicate better (no more multiple copies of
  0x0!)
- Sampler views are not currently declared.
- nir_registers don't have their live ranges tracked, so TGSI temp usage
  may go up with a lot of control flow.
- nir_lower_vec_to_movs naively inserts MOVs instead of trying to
  coalesce them with the instructions generating the SSA values,
  sometimes increasing instruction count.

Acked-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3395>
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index f8ef382..9555a65 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -315,6 +315,7 @@
 NIR_SOURCES := \
 	nir/tgsi_to_nir.c \
 	nir/tgsi_to_nir.h \
+	nir/nir_to_tgsi.c \
 	nir/nir_draw_helpers.c \
 	nir/nir_draw_helpers.h
 
diff --git a/src/gallium/auxiliary/meson.build b/src/gallium/auxiliary/meson.build
index 37a8e01..b3032a7 100644
--- a/src/gallium/auxiliary/meson.build
+++ b/src/gallium/auxiliary/meson.build
@@ -329,6 +329,8 @@
   'util/u_viewport.h',
   'nir/tgsi_to_nir.c',
   'nir/tgsi_to_nir.h',
+  'nir/nir_to_tgsi.c',
+  'nir/nir_to_tgsi.h',
   'nir/nir_draw_helpers.c',
   'nir/nir_draw_helpers.h',
 )
diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c
new file mode 100644
index 0000000..ce4f5e7
--- /dev/null
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -0,0 +1,2647 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_deref.h"
+#include "nir/nir_to_tgsi.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_from_mesa.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_ureg.h"
+#include "util/debug.h"
+
+struct ntt_compile {
+   nir_shader *s;
+   nir_function_impl *impl;
+   struct pipe_screen *screen;
+   struct ureg_program *ureg;
+
+   bool needs_texcoord_semantic;
+   bool any_reg_as_address;
+   bool native_integers;
+
+   int next_addr_reg;
+   bool addr_declared[2];
+   struct ureg_dst addr_reg[2];
+
+   unsigned loop_label;
+
+   /* TGSI temps for our NIR SSA and register values. */
+   struct ureg_dst *reg_temp;
+   struct ureg_dst *ssa_temp;
+
+   nir_instr_liveness *liveness;
+
+   /* Mappings from driver_location to TGSI input/output number.
+    *
+    * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
+    * their numbers assigned incrementally, unlike inputs or constants.
+    */
+   struct ureg_src *input_index_map;
+   uint64_t centroid_inputs;
+
+   struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
+};
+
+static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
+
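+/* Expands a writemask over 64-bit channels into the 32-bit TGSI channel
+ * pairs they occupy: bit 0 (.x) becomes .xy and bit 1 (.y) becomes .zw
+ * (e.g. 0x1 -> 0x3, 0x3 -> 0xf).
+ */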
+static unsigned
+ntt_64bit_write_mask(unsigned write_mask)
+{
+   return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
+}
+
+static struct ureg_src
+ntt_64bit_1f(struct ntt_compile *c)
+{
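+   /* 1.0 as an IEEE-754 double is 0x3ff0000000000000; TGSI stores doubles as
+    * lo/hi 32-bit pairs, so emit the constant for both the .xy and .zw pairs.
+    */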
+   return ureg_imm4u(c->ureg,
+                     0x00000000, 0x3ff00000,
+                     0x00000000, 0x3ff00000);
+}
+
+static const struct glsl_type *
+ntt_shader_input_type(struct ntt_compile *c,
+                      struct nir_variable *var)
+{
+   switch (c->s->info.stage) {
+   case MESA_SHADER_GEOMETRY:
+   case MESA_SHADER_TESS_EVAL:
+   case MESA_SHADER_TESS_CTRL:
+      if (glsl_type_is_array(var->type))
+         return glsl_get_array_element(var->type);
+      else
+         return var->type;
+   default:
+      return var->type;
+   }
+}
+
+static void
+ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
+                            unsigned *semantic_name, unsigned *semantic_index)
+{
+   /* We want to use most of tgsi_get_gl_varying_semantic(), but the
+    * !texcoord shifting has already been applied, so avoid that.
+    */
+   if (!c->needs_texcoord_semantic &&
+       (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
+      *semantic_name = TGSI_SEMANTIC_GENERIC;
+      *semantic_index = location - VARYING_SLOT_VAR0;
+      return;
+   }
+
+   tgsi_get_gl_varying_semantic(location, true,
+                                semantic_name, semantic_index);
+}
+
+/* TGSI varying declarations have a component usage mask associated (used by
+ * r600 and svga).
+ */
+static uint32_t
+ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
+                    bool is_64)
+{
+   uint32_t usage_mask =
+      u_bit_consecutive(start_component, num_components);
+
+   if (is_64) {
+      if (start_component >= 2)
+         usage_mask >>= 2;
+
+      uint32_t tgsi_usage_mask = 0;
+
+      if (usage_mask & TGSI_WRITEMASK_X)
+         tgsi_usage_mask |= TGSI_WRITEMASK_XY;
+      if (usage_mask & TGSI_WRITEMASK_Y)
+         tgsi_usage_mask |= TGSI_WRITEMASK_ZW;
+
+      return tgsi_usage_mask;
+   } else {
+      return usage_mask;
+   }
+}
+
+/* TGSI varying declarations have a component usage mask associated (used by
+ * r600 and svga).
+ */
+static uint32_t
+ntt_tgsi_var_usage_mask(const struct nir_variable *var)
+{
+   const struct glsl_type *type_without_array =
+      glsl_without_array(var->type);
+   unsigned num_components = glsl_get_vector_elements(type_without_array);
+   if (num_components == 0) /* structs */
+      num_components = 4;
+
+   return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
+                              glsl_type_is_64bit(type_without_array));
+}
+
+static void
+ntt_setup_inputs(struct ntt_compile *c)
+{
+   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
+      return;
+
+   unsigned num_inputs = 0;
+   int num_input_arrays = 0;
+
+   nir_foreach_shader_in_variable(var, c->s) {
+      const struct glsl_type *type = ntt_shader_input_type(c, var);
+      unsigned array_len =
+         glsl_count_attribute_slots(type, false);
+
+      num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
+   }
+
+   c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
+
+   nir_foreach_shader_in_variable(var, c->s) {
+      const struct glsl_type *type = ntt_shader_input_type(c, var);
+      unsigned array_len =
+         glsl_count_attribute_slots(type, false);
+
+      unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
+      unsigned sample_loc;
+      struct ureg_src decl;
+
+      if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+         interpolation =
+            tgsi_get_interp_mode(var->data.interpolation,
+                                 var->data.location == VARYING_SLOT_COL0 ||
+                                 var->data.location == VARYING_SLOT_COL1);
+
+         if (var->data.location == VARYING_SLOT_POS)
+            interpolation = TGSI_INTERPOLATE_LINEAR;
+      }
+
+      unsigned semantic_name, semantic_index;
+      ntt_get_gl_varying_semantic(c, var->data.location,
+                                  &semantic_name, &semantic_index);
+
+      if (var->data.sample) {
+         sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
+      } else if (var->data.centroid) {
+         sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
+         c->centroid_inputs |= (BITFIELD64_MASK(array_len) <<
+                                var->data.driver_location);
+      } else {
+         sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
+      }
+
+      unsigned array_id = 0;
+      if (glsl_type_is_array(type))
+         array_id = ++num_input_arrays;
+
+      uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
+
+      decl = ureg_DECL_fs_input_cyl_centroid_layout(c->ureg,
+                                                    semantic_name,
+                                                    semantic_index,
+                                                    interpolation,
+                                                    0,
+                                                    sample_loc,
+                                                    var->data.driver_location,
+                                                    usage_mask,
+                                                    array_id, array_len);
+
+      if (semantic_name == TGSI_SEMANTIC_FACE) {
+         struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
+         /* NIR is ~0 front and 0 back, while TGSI is +1 front */
+         ureg_SGE(c->ureg, temp, decl, ureg_imm1f(c->ureg, 0));
+         decl = ureg_src(temp);
+      }
+
+      for (unsigned i = 0; i < array_len; i++) {
+         c->input_index_map[var->data.driver_location + i] = decl;
+         c->input_index_map[var->data.driver_location + i].Index += i;
+      }
+   }
+}
+
+static void
+ntt_setup_uniforms(struct ntt_compile *c)
+{
+   struct pipe_screen *screen = c->screen;
+   bool packed = screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS);
+
+   nir_foreach_uniform_variable(var, c->s) {
+      if (glsl_type_is_image(var->type)) {
+         c->images[var->data.binding] = ureg_DECL_image(c->ureg,
+                                                        var->data.binding,
+                                                        TGSI_TEXTURE_2D,
+                                                        var->data.image.format,
+                                                        !var->data.read_only,
+                                                        false);
+      } else if (var->data.mode == nir_var_mem_ubo) {
+         ureg_DECL_constant2D(c->ureg, 0, 0, var->data.driver_location + 1);
+      } else {
+         unsigned size;
+         if (packed) {
+            size = DIV_ROUND_UP(glsl_count_dword_slots(var->type,
+                                                       var->data.bindless), 4);
+         } else {
+            size = glsl_count_vec4_slots(var->type, false, var->data.bindless);
+         }
+
+         for (unsigned i = 0; i < size; i++)
+            ureg_DECL_constant(c->ureg, var->data.driver_location + i);
+      }
+   }
+
+   for (int i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      if (c->s->info.textures_used & (1 << i))
+         ureg_DECL_sampler(c->ureg, i);
+   }
+}
+
+static void
+ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
+{
+   foreach_list_typed(nir_register, nir_reg, node, list) {
+      struct ureg_dst decl;
+      if (nir_reg->num_array_elems == 0) {
+         uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
+         if (nir_reg->bit_size == 64) {
+            if (nir_reg->num_components > 2) {
+               fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
+                       nir_reg->num_components, nir_reg->index);
+            }
+
+            write_mask = ntt_64bit_write_mask(write_mask);
+         }
+
+         decl = ureg_writemask(ureg_DECL_temporary(c->ureg), write_mask);
+      } else {
+         decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems,
+                                          true);
+      }
+      c->reg_temp[nir_reg->index] = decl;
+   }
+}
+
+static struct ureg_src
+ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
+{
+   uint32_t values[4];
+   int num_components = instr->def.num_components;
+
+   if (instr->def.bit_size == 32) {
+      for (int i = 0; i < num_components; i++)
+         values[i] = instr->value[i].u32;
+   } else {
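+      /* Split each 64-bit value into lo/hi 32-bit words in consecutive
+       * channels, matching how TGSI reads 64-bit operands.
+       */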
+      assert(num_components <= 2);
+      for (int i = 0; i < num_components; i++) {
+         values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
+         values[i * 2 + 1] = instr->value[i].u64 >> 32;
+      }
+      num_components *= 2;
+   }
+
+   return ureg_DECL_immediate_uint(c->ureg, values, num_components);
+}
+
+static struct ureg_src
+ntt_reladdr(struct ntt_compile *c, struct ureg_src addr)
+{
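+   /* If the driver can index with any register, return the value directly;
+    * otherwise copy it into one of the two dedicated ADDR registers with
+    * UARL.  next_addr_reg counts how many are in use until ntt_put_reladdr()
+    * releases them.
+    */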
+   if (c->any_reg_as_address) {
+      /* Make sure we're getting the refcounting right even on any_reg
+       * drivers.
+       */
+      c->next_addr_reg++;
+
+      return ureg_scalar(addr, 0);
+   }
+
+   assert(c->next_addr_reg < ARRAY_SIZE(c->addr_reg));
+
+   if (!c->addr_declared[c->next_addr_reg]) {
+      c->addr_reg[c->next_addr_reg] = ureg_writemask(ureg_DECL_address(c->ureg),
+                                                     TGSI_WRITEMASK_X);
+      c->addr_declared[c->next_addr_reg] = true;
+   }
+
+   ureg_UARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
+   return ureg_scalar(ureg_src(c->addr_reg[c->next_addr_reg++]), 0);
+}
+
+static void
+ntt_put_reladdr(struct ntt_compile *c)
+{
+   c->next_addr_reg--;
+   assert(c->next_addr_reg >= 0);
+}
+
+static void
+ntt_reladdr_dst_put(struct ntt_compile *c, struct ureg_dst dst)
+{
+   if (c->any_reg_as_address)
+      return;
+
+   if (dst.Indirect)
+      ntt_put_reladdr(c);
+   if (dst.DimIndirect)
+      ntt_put_reladdr(c);
+}
+
+static struct ureg_src
+ntt_get_src(struct ntt_compile *c, nir_src src)
+{
+   if (src.is_ssa) {
+      if (src.ssa->parent_instr->type == nir_instr_type_load_const)
+         return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));
+
+      return ureg_src(c->ssa_temp[src.ssa->index]);
+   } else {
+      nir_register *reg = src.reg.reg;
+      struct ureg_dst reg_temp = c->reg_temp[reg->index];
+      reg_temp.Index += src.reg.base_offset;
+
+      if (src.reg.indirect) {
+         struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
+         return ureg_src_indirect(ureg_src(reg_temp),
+                                  ntt_reladdr(c, offset));
+      } else {
+         return ureg_src(reg_temp);
+      }
+   }
+}
+
+static struct ureg_src
+ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
+{
+   nir_alu_src src = instr->src[i];
+   struct ureg_src usrc = ntt_get_src(c, src.src);
+
+   if (nir_src_bit_size(src.src) == 64) {
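+      /* Each 64-bit channel is a pair of 32-bit TGSI channels, so build the
+       * swizzle out of lo/hi pairs; per-component ops (input_sizes == 0)
+       * take their channels from the dest writemask.
+       */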
+      int chan0 = 0, chan1 = 1;
+      if (nir_op_infos[instr->op].input_sizes[i] == 0) {
+         chan0 = ffs(instr->dest.write_mask) - 1;
+         chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
+         if (chan1 == -1)
+            chan1 = chan0;
+      }
+      usrc = ureg_swizzle(usrc,
+                          src.swizzle[chan0] * 2,
+                          src.swizzle[chan0] * 2 + 1,
+                          src.swizzle[chan1] * 2,
+                          src.swizzle[chan1] * 2 + 1);
+   } else {
+      usrc = ureg_swizzle(usrc,
+                          src.swizzle[0],
+                          src.swizzle[1],
+                          src.swizzle[2],
+                          src.swizzle[3]);
+   }
+
+   if (src.abs)
+      usrc = ureg_abs(usrc);
+   if (src.negate)
+      usrc = ureg_negate(usrc);
+
+   return usrc;
+}
+
+static struct ureg_dst *
+ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
+{
+   struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
+
+   uint32_t writemask = BITSET_MASK(ssa->num_components);
+   if (ssa->bit_size == 64)
+      writemask = ntt_64bit_write_mask(writemask);
+
+   c->ssa_temp[ssa->index] = ureg_writemask(temp, writemask);
+
+   return &c->ssa_temp[ssa->index];
+}
+
+static struct ureg_dst *
+ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
+{
+   if (dest->is_ssa)
+      return ntt_get_ssa_def_decl(c, &dest->ssa);
+   else
+      return &c->reg_temp[dest->reg.reg->index];
+}
+
+static struct ureg_dst
+ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
+{
+   struct ureg_dst dst = *ntt_get_dest_decl(c, dest);
+
+   if (!dest->is_ssa) {
+      dst.Index += dest->reg.base_offset;
+
+      if (dest->reg.indirect) {
+         struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
+         dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset));
+      }
+   }
+
+   return dst;
+}
+
+/* For an SSA dest being populated by a constant src, replace the storage with
+ * a copy of the ureg_src.
+ */
+static void
+ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
+{
+   if (!src.Negate && !src.Absolute && !src.Indirect && !src.DimIndirect &&
+       src.SwizzleX == TGSI_SWIZZLE_X &&
+       (src.SwizzleY == TGSI_SWIZZLE_Y || def->num_components < 2) &&
+       (src.SwizzleZ == TGSI_SWIZZLE_Z || def->num_components < 3) &&
+       (src.SwizzleW == TGSI_SWIZZLE_W || def->num_components < 4)) {
+      switch (src.File) {
+      case TGSI_FILE_IMMEDIATE:
+      case TGSI_FILE_INPUT:
+      case TGSI_FILE_CONSTANT:
+      case TGSI_FILE_SYSTEM_VALUE:
+         c->ssa_temp[def->index] = ureg_dst(src);
+         return;
+      }
+   }
+
+   ureg_MOV(c->ureg, *ntt_get_ssa_def_decl(c, def), src);
+}
+
+static void
+ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
+{
+   if (dest->is_ssa)
+      ntt_store_def(c, &dest->ssa, src);
+   else {
+      struct ureg_dst dst = ntt_get_dest(c, dest);
+      ureg_MOV(c->ureg, dst, src);
+   }
+}
+
+static void
+ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
+                struct ureg_dst dst,
+                struct ureg_src src0,
+                struct ureg_src src1)
+{
+   unsigned i;
+   int num_src;
+
+   /* POW is the only 2-operand scalar op. */
+   if (tgsi_op == TGSI_OPCODE_POW) {
+      num_src = 2;
+   } else {
+      num_src = 1;
+      src1 = src0;
+   }
+
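+   /* TGSI scalar ops replicate one src channel to all dst channels, so emit
+    * a separate instruction per enabled dst channel with the matching src
+    * scalar.
+    */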
+   for (i = 0; i < 4; i++) {
+      if (dst.WriteMask & (1 << i)) {
+         struct ureg_dst this_dst = dst;
+         struct ureg_src srcs[2] = {
+            ureg_scalar(src0, i),
+            ureg_scalar(src1, i),
+         };
+         this_dst.WriteMask = (1 << i);
+
+         ureg_insn(c->ureg, tgsi_op, &this_dst, 1, srcs, num_src, false);
+      }
+   }
+}
+
+static void
+ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
+{
+   struct ureg_src src[4];
+   struct ureg_dst dst;
+   unsigned i;
+   int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
+   int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
+   int num_srcs = nir_op_infos[instr->op].num_inputs;
+
+   assert(num_srcs <= ARRAY_SIZE(src));
+   for (i = 0; i < num_srcs; i++)
+      src[i] = ntt_get_alu_src(c, instr, i);
+   dst = ntt_get_dest(c, &instr->dest.dest);
+
+   if (instr->dest.saturate)
+      dst.Saturate = true;
+
+   if (dst_64)
+      dst.WriteMask = ntt_64bit_write_mask(instr->dest.write_mask);
+   else
+      dst.WriteMask = instr->dest.write_mask;
+
+   static enum tgsi_opcode op_map[][2] = {
+      [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
+
+      /* fabs/fneg 32-bit are special-cased below. */
+      [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
+      [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },
+
+      [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
+      [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
+      [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
+      [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
+      [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
+      [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
+      [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
+      [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
+      [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
+      [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },
+
+      [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
+      [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
+      [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },
+
+      /* The conversions will have one combination of src and dst bitsize. */
+      [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
+      [nir_op_f2f64] = { TGSI_OPCODE_F2D },
+      [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },
+
+      [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
+      [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
+      [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
+      [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
+      [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
+      [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
+      [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
+      [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },
+
+      [nir_op_slt] = { TGSI_OPCODE_SLT },
+      [nir_op_sge] = { TGSI_OPCODE_SGE },
+      [nir_op_seq] = { TGSI_OPCODE_SEQ },
+      [nir_op_sne] = { TGSI_OPCODE_SNE },
+
+      [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
+      [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
+      [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
+      [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },
+
+      [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
+      [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
+      [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
+      [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },
+
+      [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
+      [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },
+
+      [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
+      [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
+      [nir_op_fsign] = { TGSI_OPCODE_SSG },
+      [nir_op_isign] = { TGSI_OPCODE_ISSG },
+      [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
+      [nir_op_fddx] = { TGSI_OPCODE_DDX },
+      [nir_op_fddy] = { TGSI_OPCODE_DDY },
+      [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
+      [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
+      [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
+      [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
+      [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
+      [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
+      [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
+      [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
+      [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
+      [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
+      [nir_op_bit_count] = { TGSI_OPCODE_POPC },
+      [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
+      [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
+      [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
+      [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
+      [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
+      [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
+      [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
+      [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
+      [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
+      [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
+      [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
+      [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
+      [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
+      [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
+
+      /* These bitwise ops don't care about 32 vs 64 types, so they have the
+       * same TGSI op.
+       */
+      [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
+      [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
+      [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
+      [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
+
+      [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
+      [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
+      [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
+      [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
+      [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
+      [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
+      [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
+      [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
+   };
+
+   /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
+    * of .xy.  Store to a temp and move it to the real dst.
+    */
+   bool tgsi_64bit_compare = src_64 && !dst_64 &&
+      (num_srcs == 2 ||
+        nir_op_infos[instr->op].output_type == nir_type_bool32) &&
+      (dst.WriteMask != TGSI_WRITEMASK_X);
+
+   /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
+    * channels and will need to get fixed up.
+    */
+   bool tgsi_64bit_downconvert = (src_64 && !dst_64 &&
+                                  num_srcs == 1 && !tgsi_64bit_compare &&
+                                  (dst.WriteMask & ~TGSI_WRITEMASK_XY));
+
+   struct ureg_dst real_dst = ureg_dst_undef();
+   if (tgsi_64bit_compare || tgsi_64bit_downconvert) {
+      real_dst = dst;
+      dst = ureg_DECL_temporary(c->ureg);
+   }
+
+   bool table_op64 = src_64;
+   if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
+      /* The normal path for NIR to TGSI ALU op translation */
+      ureg_insn(c->ureg, op_map[instr->op][table_op64],
+                &dst, 1, src, num_srcs, false);
+   } else {
+      /* Special cases for NIR to TGSI ALU op translation. */
+
+      /* TODO: Use something like the ntt_store() path for the MOV calls so we
+       * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
+       */
+
+      switch (instr->op) {
+      case nir_op_u2u64:
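+         /* Zero-extend: put the 32-bit source in the low word of each pair
+          * and mask the high word to zero.
+          */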
+         ureg_AND(c->ureg, dst, ureg_swizzle(src[0],
+                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
+                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
+                  ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
+         break;
+
+      case nir_op_i2i32:
+      case nir_op_u2u32:
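+         /* Truncate to 32 bits by taking the low word of each 64-bit pair. */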
+         assert(src_64);
+         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
+                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
+                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
+         break;
+
+      case nir_op_fabs:
+         ureg_MOV(c->ureg, dst, ureg_abs(src[0]));
+         break;
+
+      case nir_op_fsat:
+         if (dst_64) {
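+            /* Clamp to [0.0, 1.0] explicitly rather than using the saturate
+             * modifier on a 64-bit dest.
+             */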
+            ureg_MIN(c->ureg, dst, src[0], ntt_64bit_1f(c));
+            ureg_MAX(c->ureg, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
+         } else {
+            ureg_MOV(c->ureg, ureg_saturate(dst), src[0]);
+         }
+         break;
+
+      case nir_op_fneg:
+         ureg_MOV(c->ureg, dst, ureg_negate(src[0]));
+         break;
+
+         /* NOTE: TGSI 32-bit math ops have the old "one source channel
+          * replicated to all dst channels" behavior, while 64 is normal mapping
+          * of src channels to dst.
+          */
+      case nir_op_frcp:
+         assert(!dst_64);
+         ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], src[1]);
+         break;
+
+      case nir_op_frsq:
+         assert(!dst_64);
+         ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], src[1]);
+         break;
+
+      case nir_op_fsqrt:
+         assert(!dst_64);
+         ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], src[1]);
+         break;
+
+      case nir_op_fexp2:
+         assert(!dst_64);
+         ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], src[1]);
+         break;
+
+      case nir_op_flog2:
+         assert(!dst_64);
+         ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], src[1]);
+         break;
+
+      case nir_op_b2f32:
+         ureg_AND(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 1.0));
+         break;
+
+      case nir_op_b2f64:
+         ureg_AND(c->ureg, dst,
+                  ureg_swizzle(src[0],
+                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
+                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
+                  ntt_64bit_1f(c));
+         break;
+
+      case nir_op_f2b32:
+         if (src_64)
+            ureg_DSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
+         else
+            ureg_FSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
+         break;
+
+      case nir_op_i2b32:
+         if (src_64)
+            ureg_U64SNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
+         else
+            ureg_USNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
+         break;
+
+      case nir_op_b2i32:
+         ureg_AND(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 1));
+         break;
+
+      case nir_op_b2i64:
+         ureg_AND(c->ureg, dst,
+                  ureg_swizzle(src[0],
+                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
+                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
+                  ureg_imm4u(c->ureg, 1, 0, 1, 0));
+         break;
+
+      case nir_op_fsin:
+         ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], src[1]);
+         break;
+
+      case nir_op_fcos:
+         ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], src[1]);
+         break;
+
+      case nir_op_fsub:
+         assert(!dst_64);
+         ureg_ADD(c->ureg, dst, src[0], ureg_negate(src[1]));
+         break;
+
+      case nir_op_isub:
+         assert(!dst_64);
+         ureg_UADD(c->ureg, dst, src[0], ureg_negate(src[1]));
+         break;
+
+         /* XXX: carry */
+
+      case nir_op_fmod:
+         unreachable("should be handled by .lower_fmod = true");
+         break;
+
+      case nir_op_fpow:
+         ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
+         break;
+
+      case nir_op_flrp:
+         ureg_LRP(c->ureg, dst, src[2], src[1], src[0]);
+         break;
+
+      case nir_op_pack_64_2x32_split:
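+         /* Interleave: src0 supplies the low words (.xz) and src1 the high
+          * words (.yw) of the packed 64-bit channels.
+          */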
+         ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
+                  ureg_swizzle(src[0],
+                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
+                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
+         ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_YW),
+                  ureg_swizzle(src[1],
+                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
+                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
+         break;
+
+      case nir_op_unpack_64_2x32_split_x:
+         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
+                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
+                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
+         break;
+
+      case nir_op_unpack_64_2x32_split_y:
+         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
+                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
+                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
+         break;
+
+      case nir_op_b32csel:
+         if (nir_src_bit_size(instr->src[1].src) == 64) {
+            ureg_UCMP(c->ureg, dst, ureg_swizzle(src[0],
+                                                 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
+                                                 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
+                      src[1], src[2]);
+         } else {
+            ureg_UCMP(c->ureg, dst, src[0], src[1], src[2]);
+         }
+         break;
+
+      case nir_op_fcsel:
+         /* NIR is src0 != 0 ? src1 : src2.
+          * TGSI is src0 < 0 ? src1 : src2.
+          *
+          * However, fcsel so far as I can find only appears on
+          * bools-as-floats (1.0 or 0.0), so we can negate it for the TGSI op.
+          */
+         ureg_CMP(c->ureg, dst, ureg_negate(src[0]), src[2], src[1]);
+         break;
+
+         /* It would be nice if we could get this left as scalar in NIR, since
+          * the TGSI op is scalar.
+          */
+      case nir_op_frexp_sig:
+      case nir_op_frexp_exp: {
+         assert(src_64);
+         struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
+
+         for (int chan = 0; chan < 2; chan++) {
+            int wm = 1 << chan;
+
+            if (!(instr->dest.write_mask & wm))
+               continue;
+
+            struct ureg_dst dsts[2] = { temp, temp };
+            if (instr->op == nir_op_frexp_sig) {
+               dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
+            } else {
+               dsts[1] = ureg_writemask(dst, wm);
+            }
+
+            struct ureg_src chan_src = ureg_swizzle(src[0],
+                                                    chan * 2, chan * 2 + 1,
+                                                    chan * 2, chan * 2 + 1);
+
+            ureg_insn(c->ureg, TGSI_OPCODE_DFRACEXP,
+                      dsts, 2,
+                      &chan_src, 1, false);
+         }
+
+         ureg_release_temporary(c->ureg, temp);
+         break;
+      }
+
+      case nir_op_ldexp:
+         assert(dst_64); /* 32bit handled in table. */
+         ureg_DLDEXP(c->ureg, dst, src[0],
+                     ureg_swizzle(src[1],
+                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
+                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
+         break;
+
+      case nir_op_vec4:
+      case nir_op_vec3:
+      case nir_op_vec2:
+         unreachable("covered by nir_lower_vec_to_movs()");
+
+      default:
+         fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
+         unreachable("Unknown NIR opcode");
+      }
+   }
+
+   /* 64-bit op fixup movs */
+   if (!ureg_dst_is_undef(real_dst)) {
+      if (tgsi_64bit_compare) {
+         ureg_MOV(c->ureg, real_dst,
+                  ureg_swizzle(ureg_src(dst), 0, 2, 0, 2));
+      } else {
+         assert(tgsi_64bit_downconvert);
+         uint8_t swizzle[] = {0, 0, 0, 0};
+         uint32_t second_bit = real_dst.WriteMask & ~(1 << (ffs(real_dst.WriteMask) - 1));
+         if (second_bit)
+            swizzle[ffs(second_bit) - 1] = 1;
+         ureg_MOV(c->ureg, real_dst, ureg_swizzle(ureg_src(dst),
+                                                  swizzle[0],
+                                                  swizzle[1],
+                                                  swizzle[2],
+                                                  swizzle[3]));
+      }
+      ureg_release_temporary(c->ureg, dst);
+   }
+}
+
+static struct ureg_src
+ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
+                      nir_src src)
+{
+   if (nir_src_is_const(src)) {
+      usrc.Index += nir_src_as_uint(src);
+      return usrc;
+   } else {
+      return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src)));
+   }
+}
+
+static struct ureg_dst
+ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
+                      nir_src src)
+{
+   if (nir_src_is_const(src)) {
+      dst.Index += nir_src_as_uint(src);
+      return dst;
+   } else {
+      return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src)));
+   }
+}
+
+static struct ureg_src
+ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
+                         nir_src src)
+{
+   if (nir_src_is_const(src)) {
+      return ureg_src_dimension(usrc, nir_src_as_uint(src));
+   } else {
+      return ureg_src_dimension_indirect(usrc,
+                                         ntt_reladdr(c, ntt_get_src(c, src)),
+                                         1);
+   }
+}
+
+static void
+ntt_emit_load_uniform(struct ntt_compile *c, nir_intrinsic_instr *instr)
+{
+   struct ureg_src src =
+      ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_CONSTANT,
+                                                 nir_intrinsic_base(instr)),
+                            instr->src[0]);
+   ntt_store(c, &instr->dest, src);
+}
+
+/* Some load operations in NIR will have a fractional offset that we need to
+ * swizzle down before storing to the result register.
+ */
+static struct ureg_src
+ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
+{
+   return ureg_swizzle(src,
+                       frac,
+                       frac + MIN2(num_components - 1, 1),
+                       frac + MIN2(num_components - 1, 2),
+                       frac + MIN2(num_components - 1, 3));
+}
+
+/* PIPE_CAP_LOAD_CONSTBUF */
+static void
+ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
+{
+   /* XXX: Emit a TGSI_OPCODE_LOAD instr. */
+}
+
+/* !PIPE_CAP_LOAD_CONSTBUF */
+static void
+ntt_emit_load_ubo_vec4(struct ntt_compile *c, nir_intrinsic_instr *instr)
+{
+   int bit_size = nir_dest_bit_size(instr->dest);
+   assert(bit_size == 32 || instr->num_components <= 2);
+
+   struct ureg_src src;
+   if (nir_src_is_const(instr->src[1])) {
+      src = ureg_src_register(TGSI_FILE_CONSTANT,
+                              nir_src_as_uint(instr->src[1]));
+   } else {
+      src = ureg_src_indirect(ureg_src_register(TGSI_FILE_CONSTANT, 0),
+                              ntt_reladdr(c, ntt_get_src(c, instr->src[1])));
+   }
+
+   int start_component = nir_intrinsic_component(instr);
+   if (bit_size == 64)
+      start_component *= 2;
+
+   src = ntt_shift_by_frac(src, start_component,
+                           instr->num_components * bit_size / 32);
+
+   if (nir_src_is_const(instr->src[0])) {
+      src = ureg_src_dimension(src, nir_src_as_uint(instr->src[0]) + 1);
+   } else {
+      struct ureg_src block_index = ntt_get_src(c, instr->src[0]);
+
+      src = ureg_src_dimension_indirect(src, ntt_reladdr(c, block_index), 1);
+   }
+
+   ntt_store(c, &instr->dest, src);
+}
+
+static unsigned
+ntt_get_access_qualifier(nir_intrinsic_instr *instr)
+{
+   enum gl_access_qualifier access = nir_intrinsic_access(instr);
+   unsigned qualifier = 0;
+
+   if (access & ACCESS_COHERENT)
+      qualifier |= TGSI_MEMORY_COHERENT;
+   if (access & ACCESS_VOLATILE)
+      qualifier |= TGSI_MEMORY_VOLATILE;
+   if (access & ACCESS_RESTRICT)
+      qualifier |= TGSI_MEMORY_RESTRICT;
+
+   return qualifier;
+}
+
+static void
+ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
+             nir_variable_mode mode)
+{
+   bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
+                    instr->intrinsic == nir_intrinsic_store_shared);
+   bool is_load = (instr->intrinsic == nir_intrinsic_load_ssbo ||
+                    instr->intrinsic == nir_intrinsic_load_shared);
+   unsigned opcode;
+   struct ureg_src src[4];
+   int num_src = 0;
+   int nir_src;
+
+   struct ureg_src memory;
+   switch (mode) {
+   case nir_var_mem_ssbo:
+      /* XXX: TGSI should have BUFFER declarations for the SSBOs.  Needed for
+       * r600, nv50, llvmpipe.
+       */
+      memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
+                                     instr->src[is_store ? 1 : 0]);
+      nir_src = 1;
+      break;
+   case nir_var_mem_shared:
+      memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
+      nir_src = 0;
+      break;
+   default:
+      unreachable("unknown memory type");
+   }
+
+   if (is_store) {
+      src[num_src++] = ntt_get_src(c, instr->src[nir_src + 1]); /* offset */
+      src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
+   } else {
+      src[num_src++] = memory;
+      if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
+         src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* offset */
+         if (!is_load)
+            src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* value */
+      }
+   }
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_ssbo_atomic_add:
+   case nir_intrinsic_shared_atomic_add:
+      opcode = TGSI_OPCODE_ATOMUADD;
+      break;
+   case nir_intrinsic_ssbo_atomic_fadd:
+   case nir_intrinsic_shared_atomic_fadd:
+      opcode = TGSI_OPCODE_ATOMFADD;
+      break;
+   case nir_intrinsic_ssbo_atomic_imin:
+   case nir_intrinsic_shared_atomic_imin:
+      opcode = TGSI_OPCODE_ATOMIMIN;
+      break;
+   case nir_intrinsic_ssbo_atomic_imax:
+   case nir_intrinsic_shared_atomic_imax:
+      opcode = TGSI_OPCODE_ATOMIMAX;
+      break;
+   case nir_intrinsic_ssbo_atomic_umin:
+   case nir_intrinsic_shared_atomic_umin:
+      opcode = TGSI_OPCODE_ATOMUMIN;
+      break;
+   case nir_intrinsic_ssbo_atomic_umax:
+   case nir_intrinsic_shared_atomic_umax:
+      opcode = TGSI_OPCODE_ATOMUMAX;
+      break;
+   case nir_intrinsic_ssbo_atomic_and:
+   case nir_intrinsic_shared_atomic_and:
+      opcode = TGSI_OPCODE_ATOMAND;
+      break;
+   case nir_intrinsic_ssbo_atomic_or:
+   case nir_intrinsic_shared_atomic_or:
+      opcode = TGSI_OPCODE_ATOMOR;
+      break;
+   case nir_intrinsic_ssbo_atomic_xor:
+   case nir_intrinsic_shared_atomic_xor:
+      opcode = TGSI_OPCODE_ATOMXOR;
+      break;
+   case nir_intrinsic_ssbo_atomic_exchange:
+   case nir_intrinsic_shared_atomic_exchange:
+      opcode = TGSI_OPCODE_ATOMXCHG;
+      break;
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+   case nir_intrinsic_shared_atomic_comp_swap:
+      opcode = TGSI_OPCODE_ATOMCAS;
+      src[num_src++] = ntt_get_src(c, instr->src[nir_src++]);
+      break;
+   case nir_intrinsic_load_ssbo:
+   case nir_intrinsic_load_shared:
+      opcode = TGSI_OPCODE_LOAD;
+      break;
+   case nir_intrinsic_store_ssbo:
+   case nir_intrinsic_store_shared:
+      opcode = TGSI_OPCODE_STORE;
+      break;
+   case nir_intrinsic_get_ssbo_size:
+      opcode = TGSI_OPCODE_RESQ;
+      break;
+   default:
+      unreachable("unknown memory op");
+   }
+
+   unsigned qualifier = 0;
+   if (mode == nir_var_mem_ssbo &&
+       instr->intrinsic != nir_intrinsic_get_ssbo_size) {
+      qualifier = ntt_get_access_qualifier(instr);
+   }
+
+   struct ureg_dst dst;
+   if (is_store) {
+      dst = ureg_dst(memory);
+
+      unsigned write_mask = nir_intrinsic_write_mask(instr);
+      if (nir_src_bit_size(instr->src[0]) == 64)
+         write_mask = ntt_64bit_write_mask(write_mask);
+      dst = ureg_writemask(dst, write_mask);
+   } else {
+      dst = ntt_get_dest(c, &instr->dest);
+   }
+
+   ureg_memory_insn(c->ureg, opcode,
+                    &dst, 1,
+                    src, num_src,
+                    qualifier,
+                    TGSI_TEXTURE_BUFFER,
+                    0 /* format: unused */);
+}
+
+static enum tgsi_texture_type
+tgsi_target_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array)
+{
+   switch (dim) {
+   case GLSL_SAMPLER_DIM_1D:
+      return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
+   case GLSL_SAMPLER_DIM_2D:
+      return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
+   case GLSL_SAMPLER_DIM_3D:
+      return TGSI_TEXTURE_3D;
+   case GLSL_SAMPLER_DIM_CUBE:
+      return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
+   case GLSL_SAMPLER_DIM_RECT:
+      return TGSI_TEXTURE_RECT;
+   case GLSL_SAMPLER_DIM_BUF:
+      return TGSI_TEXTURE_BUFFER;
+   default:
+      unreachable("unknown sampler dim");
+   }
+}
+
+static void
+ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
+{
+   unsigned op;
+   struct ureg_src srcs[3];
+   int num_src = 0;
+
+   enum tgsi_texture_type target =
+      tgsi_target_from_sampler_dim(nir_intrinsic_image_dim(instr),
+                                   nir_intrinsic_image_array(instr));
+
+   struct ureg_src resource =
+      ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
+                            instr->src[0]);
+
+   struct ureg_dst dst;
+   if (instr->intrinsic == nir_intrinsic_image_store) {
+      dst = ureg_dst(resource);
+   } else {
+      srcs[num_src++] = resource;
+      dst = ntt_get_dest(c, &instr->dest);
+   }
+
+   if (instr->intrinsic != nir_intrinsic_image_size) {
+      srcs[num_src++] = ntt_get_src(c, instr->src[1]); /* coord */
+      /* XXX: src[2] sample index to coord.z (2d) or coord.w (2darray) */
+      if (instr->intrinsic != nir_intrinsic_image_load) {
+         srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
+         if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
+            srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
+      }
+   }
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_image_load:
+      op = TGSI_OPCODE_LOAD;
+      break;
+   case nir_intrinsic_image_store:
+      op = TGSI_OPCODE_STORE;
+      break;
+   case nir_intrinsic_image_size:
+      op = TGSI_OPCODE_RESQ;
+      break;
+   case nir_intrinsic_image_atomic_add:
+      op = TGSI_OPCODE_ATOMUADD;
+      break;
+   case nir_intrinsic_image_atomic_fadd:
+      op = TGSI_OPCODE_ATOMFADD;
+      break;
+   case nir_intrinsic_image_atomic_imin:
+      op = TGSI_OPCODE_ATOMIMIN;
+      break;
+   case nir_intrinsic_image_atomic_umin:
+      op = TGSI_OPCODE_ATOMUMIN;
+      break;
+   case nir_intrinsic_image_atomic_imax:
+      op = TGSI_OPCODE_ATOMIMAX;
+      break;
+   case nir_intrinsic_image_atomic_umax:
+      op = TGSI_OPCODE_ATOMUMAX;
+      break;
+   case nir_intrinsic_image_atomic_and:
+      op = TGSI_OPCODE_ATOMAND;
+      break;
+   case nir_intrinsic_image_atomic_or:
+      op = TGSI_OPCODE_ATOMOR;
+      break;
+   case nir_intrinsic_image_atomic_xor:
+      op = TGSI_OPCODE_ATOMXOR;
+      break;
+   case nir_intrinsic_image_atomic_exchange:
+      op = TGSI_OPCODE_ATOMXCHG;
+      break;
+   case nir_intrinsic_image_atomic_comp_swap:
+      op = TGSI_OPCODE_ATOMCAS;
+      break;
+   default:
+      unreachable("bad op");
+   }
+
+   ureg_memory_insn(c->ureg, op, &dst, 1, srcs, num_src,
+                    ntt_get_access_qualifier(instr),
+                    target,
+                    nir_intrinsic_format(instr));
+}
+
+static void
+ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
+{
+   uint32_t frac = nir_intrinsic_component(instr);
+   uint32_t num_components = instr->num_components;
+   unsigned base = nir_intrinsic_base(instr);
+   struct ureg_src input;
+   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
+   bool is_64 = nir_dest_bit_size(instr->dest) == 64;
+
+   if (c->s->info.stage == MESA_SHADER_VERTEX) {
+      input = ureg_DECL_vs_input(c->ureg, base);
+      for (int i = 1; i < semantics.num_slots; i++)
+         ureg_DECL_vs_input(c->ureg, base + i);
+   } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
+      unsigned semantic_name, semantic_index;
+      ntt_get_gl_varying_semantic(c, semantics.location,
+                                  &semantic_name, &semantic_index);
+
+      /* XXX: ArrayID is used in r600 gs inputs */
+      uint32_t array_id = 0;
+
+      input = ureg_DECL_input_layout(c->ureg,
+                                     semantic_name,
+                                     semantic_index,
+                                     base,
+                                     ntt_tgsi_usage_mask(frac,
+                                                         instr->num_components,
+                                                         is_64),
+                                     array_id,
+                                     semantics.num_slots);
+   } else {
+      input = c->input_index_map[base];
+   }
+
+   if (is_64)
+      num_components *= 2;
+
+   input = ntt_shift_by_frac(input, frac, num_components);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_input:
+      input = ntt_ureg_src_indirect(c, input, instr->src[0]);
+      ntt_store(c, &instr->dest, input);
+      break;
+
+   case nir_intrinsic_load_per_vertex_input:
+      input = ntt_ureg_src_indirect(c, input, instr->src[1]);
+      input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
+      ntt_store(c, &instr->dest, input);
+      break;
+
+   case nir_intrinsic_load_interpolated_input: {
+      input = ntt_ureg_src_indirect(c, input, instr->src[1]);
+
+      nir_intrinsic_instr *bary_instr =
+         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
+
+      switch (bary_instr->intrinsic) {
+      case nir_intrinsic_load_barycentric_pixel:
+         ntt_store(c, &instr->dest, input);
+         break;
+
+      case nir_intrinsic_load_barycentric_centroid:
+         /* If the input was declared centroid, then there's no need to
+          * emit the extra TGSI interp instruction, we can just read the
+          * input.
+          */
+         if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
+            ntt_store(c, &instr->dest, input);
+         } else {
+            ureg_INTERP_CENTROID(c->ureg, ntt_get_dest(c, &instr->dest),
+                                 input);
+         }
+         break;
+
+      case nir_intrinsic_load_barycentric_at_sample:
+         ureg_INTERP_SAMPLE(c->ureg, ntt_get_dest(c, &instr->dest), input,
+                            ureg_imm1u(c->ureg,
+                                       nir_src_as_uint(bary_instr->src[0])));
+         break;
+
+      case nir_intrinsic_load_barycentric_at_offset:
+         /* We stored the offset in the fake "bary" dest. */
+         ureg_INTERP_OFFSET(c->ureg, ntt_get_dest(c, &instr->dest), input,
+                            ntt_get_src(c, instr->src[0]));
+         break;
+
+      default:
+         unreachable("bad barycentric interp intrinsic\n");
+      }
+      break;
+   }
+
+   default:
+      unreachable("bad load input intrinsic\n");
+   }
+}
+
+static void
+ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
+{
+   /* TODO: When making an SSA def's storage, we should check if it's only
+    * used as the source of a store_output and point it at our
+    * TGSI_FILE_OUTPUT instead of generating the extra MOV here.
+    */
+   uint32_t base = nir_intrinsic_base(instr);
+   struct ureg_src src = ntt_get_src(c, instr->src[0]);
+   bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
+   struct ureg_dst out;
+   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
+   uint32_t frac = nir_intrinsic_component(instr);
+
+   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+      if (semantics.location == FRAG_RESULT_COLOR)
+         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
+
+      unsigned semantic_name, semantic_index;
+      tgsi_get_gl_frag_result_semantic(semantics.location,
+                                       &semantic_name, &semantic_index);
+      semantic_index += semantics.dual_source_blend_index;
+
+      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
+
+      switch (semantics.location) {
+      case FRAG_RESULT_DEPTH:
+         frac = 2; /* depth is written to the .z channel in TGSI */
+         break;
+      case FRAG_RESULT_STENCIL:
+         frac = 1;
+         break;
+      default:
+         break;
+      }
+   } else {
+      unsigned semantic_name, semantic_index;
+
+      ntt_get_gl_varying_semantic(c, semantics.location,
+                                  &semantic_name, &semantic_index);
+
+      uint32_t usage_mask = ntt_tgsi_usage_mask(frac,
+                                                instr->num_components,
+                                                is_64);
+      uint32_t gs_streams = semantics.gs_streams;
+      for (int i = 0; i < 4; i++) {
+         if (!(usage_mask & (1 << i)))
+            gs_streams &= ~(0x3 << 2 * i);
+      }
+
+      /* XXX: array_id is used in svga tess. */
+      unsigned array_id = 0;
+
+      /* This bit is lost in the i/o semantics, but it's unused in in-tree
+       * drivers.
+       */
+      bool invariant = false;
+
+      out = ureg_DECL_output_layout(c->ureg,
+                                    semantic_name, semantic_index,
+                                    gs_streams,
+                                    base,
+                                    usage_mask,
+                                    array_id,
+                                    semantics.num_slots,
+                                    invariant);
+   }
+
+   out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
+
+   unsigned write_mask = nir_intrinsic_write_mask(instr);
+
+   if (is_64) {
+      write_mask = ntt_64bit_write_mask(write_mask);
+      if (frac >= 2)
+         write_mask = write_mask << 2;
+   } else {
+      write_mask = write_mask << frac;
+   }
+
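+   /* Shift the source components up to the output's starting component so
+    * only the requested channels are written.
+    */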
+   uint8_t swizzle[4] = { 0, 0, 0, 0 };
+   for (int i = frac; i < 4; i++) {
+      if (write_mask & (1 << i))
+         swizzle[i] = i - frac;
+   }
+
+   src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+   out = ureg_writemask(out, write_mask);
+
+   ureg_MOV(c->ureg, out, src);
+   ntt_reladdr_dst_put(c, out);
+}
+
+static void
+ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
+{
+   gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
+   enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
+   ntt_store(c, &instr->dest, ureg_DECL_system_value(c->ureg, semantic, 0));
+}
+
+static void
+ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_uniform:
+      ntt_emit_load_uniform(c, instr);
+      break;
+
+   case nir_intrinsic_load_ubo:
+      ntt_emit_load_ubo(c, instr);
+      break;
+
+   case nir_intrinsic_load_ubo_vec4:
+      ntt_emit_load_ubo_vec4(c, instr);
+      break;
+
+      /* Vertex */
+   case nir_intrinsic_load_vertex_id:
+   case nir_intrinsic_load_vertex_id_zero_base:
+   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_base_instance:
+   case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_draw_id:
+   case nir_intrinsic_load_invocation_id:
+   case nir_intrinsic_load_frag_coord:
+   case nir_intrinsic_load_point_coord:
+   case nir_intrinsic_load_front_face:
+   case nir_intrinsic_load_sample_id:
+   case nir_intrinsic_load_sample_mask_in:
+   case nir_intrinsic_load_helper_invocation:
+   case nir_intrinsic_load_tess_coord:
+   case nir_intrinsic_load_patch_vertices_in:
+   case nir_intrinsic_load_primitive_id:
+   case nir_intrinsic_load_tess_level_outer:
+   case nir_intrinsic_load_tess_level_inner:
+   case nir_intrinsic_load_local_invocation_id:
+   case nir_intrinsic_load_work_group_id:
+   case nir_intrinsic_load_num_work_groups:
+   case nir_intrinsic_load_local_group_size:
+   case nir_intrinsic_load_subgroup_size:
+   case nir_intrinsic_load_subgroup_invocation:
+   case nir_intrinsic_load_subgroup_eq_mask:
+   case nir_intrinsic_load_subgroup_ge_mask:
+   case nir_intrinsic_load_subgroup_gt_mask:
+   case nir_intrinsic_load_subgroup_lt_mask:
+      ntt_emit_load_sysval(c, instr);
+      break;
+
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_per_vertex_input:
+   case nir_intrinsic_load_interpolated_input:
+      ntt_emit_load_input(c, instr);
+      break;
+
+   case nir_intrinsic_store_output:
+      ntt_emit_store_output(c, instr);
+      break;
+
+   case nir_intrinsic_discard:
+      ureg_KILL(c->ureg);
+      break;
+
+   case nir_intrinsic_discard_if: {
+      struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
+
+      if (c->native_integers) {
+         struct ureg_dst temp = ureg_writemask(ureg_DECL_temporary(c->ureg), 1);
+         ureg_AND(c->ureg, temp, cond, ureg_imm1f(c->ureg, 1.0));
+         ureg_KILL_IF(c->ureg, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
+         ureg_release_temporary(c->ureg, temp);
+      } else {
+         /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
+         ureg_KILL_IF(c->ureg, ureg_negate(cond));
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_ssbo:
+   case nir_intrinsic_store_ssbo:
+   case nir_intrinsic_ssbo_atomic_add:
+   case nir_intrinsic_ssbo_atomic_fadd:
+   case nir_intrinsic_ssbo_atomic_imin:
+   case nir_intrinsic_ssbo_atomic_imax:
+   case nir_intrinsic_ssbo_atomic_umin:
+   case nir_intrinsic_ssbo_atomic_umax:
+   case nir_intrinsic_ssbo_atomic_and:
+   case nir_intrinsic_ssbo_atomic_or:
+   case nir_intrinsic_ssbo_atomic_xor:
+   case nir_intrinsic_ssbo_atomic_exchange:
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+   case nir_intrinsic_get_ssbo_size:
+      ntt_emit_mem(c, instr, nir_var_mem_ssbo);
+      break;
+
+   case nir_intrinsic_load_shared:
+   case nir_intrinsic_store_shared:
+   case nir_intrinsic_shared_atomic_add:
+   case nir_intrinsic_shared_atomic_fadd:
+   case nir_intrinsic_shared_atomic_imin:
+   case nir_intrinsic_shared_atomic_imax:
+   case nir_intrinsic_shared_atomic_umin:
+   case nir_intrinsic_shared_atomic_umax:
+   case nir_intrinsic_shared_atomic_and:
+   case nir_intrinsic_shared_atomic_or:
+   case nir_intrinsic_shared_atomic_xor:
+   case nir_intrinsic_shared_atomic_exchange:
+   case nir_intrinsic_shared_atomic_comp_swap:
+      ntt_emit_mem(c, instr, nir_var_mem_shared);
+      break;
+
+   case nir_intrinsic_image_load:
+   case nir_intrinsic_image_store:
+   case nir_intrinsic_image_size:
+   case nir_intrinsic_image_atomic_add:
+   case nir_intrinsic_image_atomic_fadd:
+   case nir_intrinsic_image_atomic_imin:
+   case nir_intrinsic_image_atomic_umin:
+   case nir_intrinsic_image_atomic_imax:
+   case nir_intrinsic_image_atomic_umax:
+   case nir_intrinsic_image_atomic_and:
+   case nir_intrinsic_image_atomic_or:
+   case nir_intrinsic_image_atomic_xor:
+   case nir_intrinsic_image_atomic_exchange:
+   case nir_intrinsic_image_atomic_comp_swap:
+      ntt_emit_image_load_store(c, instr);
+      break;
+
+   case nir_intrinsic_control_barrier:
+      ureg_BARRIER(c->ureg);
+      break;
+
+   case nir_intrinsic_memory_barrier:
+      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
+                                      TGSI_MEMBAR_SHADER_BUFFER |
+                                      TGSI_MEMBAR_ATOMIC_BUFFER |
+                                      TGSI_MEMBAR_SHADER_IMAGE |
+                                      TGSI_MEMBAR_SHARED));
+      break;
+
+   case nir_intrinsic_memory_barrier_atomic_counter:
+      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
+      break;
+
+   case nir_intrinsic_memory_barrier_buffer:
+      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
+      break;
+
+   case nir_intrinsic_memory_barrier_image:
+      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
+      break;
+
+   case nir_intrinsic_memory_barrier_shared:
+      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
+      break;
+
+   case nir_intrinsic_group_memory_barrier:
+      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
+                                      TGSI_MEMBAR_SHADER_BUFFER |
+                                      TGSI_MEMBAR_ATOMIC_BUFFER |
+                                      TGSI_MEMBAR_SHADER_IMAGE |
+                                      TGSI_MEMBAR_SHARED |
+                                      TGSI_MEMBAR_THREAD_GROUP));
+      break;
+
+   case nir_intrinsic_end_primitive:
+      ureg_ENDPRIM(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
+      break;
+
+   case nir_intrinsic_emit_vertex:
+      ureg_EMIT(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
+      break;
+
+      /* In TGSI we don't actually generate the barycentric coords; the
+       * interpolation happens when the load_interpolated_input that uses
+       * them is emitted.  However, we do need to store the _at_offset
+       * argument so that we can use it at that point.
+       */
+   case nir_intrinsic_load_barycentric_pixel:
+   case nir_intrinsic_load_barycentric_centroid:
+   case nir_intrinsic_load_barycentric_at_sample:
+      break;
+
+   case nir_intrinsic_load_barycentric_at_offset:
+      ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
+      break;
+
+   default:
+      fprintf(stderr, "Unknown intrinsic: ");
+      nir_print_instr(&instr->instr, stderr);
+      fprintf(stderr, "\n");
+      break;
+   }
+}
+
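+/* Tracks the packing of NIR texture sources into the up-to-four source
+ * operands of the TGSI texture instruction.
+ */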
+struct ntt_tex_operand_state {
+   struct ureg_src srcs[4];
+   unsigned i;
+   unsigned chan;
+   bool is_temp[4];
+};
+
+static void
+ntt_push_tex_arg(struct ntt_compile *c,
+                 nir_tex_instr *instr,
+                 nir_tex_src_type tex_src_type,
+                 struct ntt_tex_operand_state *s)
+{
+   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
+   if (tex_src < 0)
+      return;
+
+   struct ureg_src src = ntt_get_src(c, instr->src[tex_src].src);
+   int num_components = nir_tex_instr_src_size(instr, tex_src);
+
+   /* Find which src in the tex args we'll fit in. */
+   if (s->chan + num_components > 4) {
+      s->chan = 0;
+      s->i++;
+   }
+
+   /* Would need to fix up swizzling up to the writemask channel here. */
+   assert(num_components == 1 || s->chan == 0);
+   if (num_components == 1)
+      src = ureg_scalar(src, 0);
+
+   if (ureg_src_is_undef(s->srcs[s->i])) {
+      /* First emit of a tex operand's components, no need for a mov. */
+      s->srcs[s->i] = src;
+   } else {
+      /* Otherwise, we need to have a temporary for all the components that go
+       * in this operand.
+       */
+      if (!s->is_temp[s->i]) {
+         struct ureg_src prev_src = s->srcs[s->i];
+         s->srcs[s->i] = ureg_src(ureg_DECL_temporary(c->ureg));
+         s->is_temp[s->i] = true;
+
+         ureg_MOV(c->ureg,
+                  ureg_writemask(ureg_dst(s->srcs[s->i]),
+                                 BITFIELD_MASK(s->chan)), prev_src);
+      }
+
+      ureg_MOV(c->ureg,
+               ureg_writemask(ureg_dst(s->srcs[s->i]),
+                              BITFIELD_RANGE(s->chan, num_components)),
+               src);
+   }
+
+   s->chan += num_components;
+}
+
+static void
+ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
+{
+   struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
+   unsigned target;
+   unsigned tex_opcode;
+
+   struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
+   int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
+   if (sampler_src >= 0) {
+      struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
+      sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr));
+   }
+
+   switch (instr->op) {
+   case nir_texop_tex:
+      tex_opcode = TGSI_OPCODE_TEX;
+      break;
+   case nir_texop_txf:
+   case nir_texop_txf_ms:
+      /* XXX: Support txf_lz */
+      tex_opcode = TGSI_OPCODE_TXF;
+      break;
+   case nir_texop_txl:
+      tex_opcode = TGSI_OPCODE_TXL;
+      break;
+   case nir_texop_txb:
+      tex_opcode = TGSI_OPCODE_TXB;
+      break;
+   case nir_texop_txd:
+      tex_opcode = TGSI_OPCODE_TXD;
+      break;
+   case nir_texop_txs:
+      tex_opcode = TGSI_OPCODE_TXQ;
+      break;
+   case nir_texop_tg4:
+      tex_opcode = TGSI_OPCODE_TG4;
+      break;
+   case nir_texop_query_levels:
+      tex_opcode = TGSI_OPCODE_TXQ;
+      break;
+   case nir_texop_lod:
+      tex_opcode = TGSI_OPCODE_LODQ;
+      break;
+   case nir_texop_texture_samples:
+      tex_opcode = TGSI_OPCODE_TXQS;
+      break;
+   default:
+      unreachable("unsupported tex op");
+   }
+
+   struct ntt_tex_operand_state s = { .i = 0 };
+   ntt_push_tex_arg(c, instr, nir_tex_src_coord, &s);
+   /* We always have at least two slots for the coordinate, even on 1D. */
+   s.chan = MAX2(s.chan, 2);
+
+   ntt_push_tex_arg(c, instr, nir_tex_src_comparator, &s);
+   s.chan = MAX2(s.chan, 3);
+
+   ntt_push_tex_arg(c, instr, nir_tex_src_bias, &s);
+   ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
+
+   /* End of packed src setup, everything that follows gets its own operand. */
+   if (s.chan)
+      s.i++;
+
+   switch (instr->sampler_dim) {
+   case GLSL_SAMPLER_DIM_1D:
+      if (instr->is_array) {
+         if (instr->is_shadow) {
+            target = TGSI_TEXTURE_SHADOW1D_ARRAY;
+         } else {
+            target = TGSI_TEXTURE_1D_ARRAY;
+         }
+      } else {
+         if (instr->is_shadow) {
+            target = TGSI_TEXTURE_SHADOW1D;
+         } else {
+            target = TGSI_TEXTURE_1D;
+         }
+      }
+      break;
+   case GLSL_SAMPLER_DIM_2D:
+   case GLSL_SAMPLER_DIM_EXTERNAL:
+      if (instr->is_array) {
+         if (instr->is_shadow) {
+            target = TGSI_TEXTURE_SHADOW2D_ARRAY;
+         } else {
+            target = TGSI_TEXTURE_2D_ARRAY;
+         }
+      } else {
+         if (instr->is_shadow) {
+            target = TGSI_TEXTURE_SHADOW2D;
+         } else {
+            target = TGSI_TEXTURE_2D;
+         }
+      }
+      break;
+   case GLSL_SAMPLER_DIM_MS:
+      if (instr->is_array) {
+         target = TGSI_TEXTURE_2D_ARRAY_MSAA;
+      } else {
+         target = TGSI_TEXTURE_2D_MSAA;
+      }
+      break;
+   case GLSL_SAMPLER_DIM_3D:
+      assert(!instr->is_shadow);
+      target = TGSI_TEXTURE_3D;
+      break;
+   case GLSL_SAMPLER_DIM_RECT:
+      if (instr->is_shadow) {
+         target = TGSI_TEXTURE_SHADOWRECT;
+      } else {
+         target = TGSI_TEXTURE_RECT;
+      }
+      break;
+   case GLSL_SAMPLER_DIM_CUBE:
+      if (instr->is_array) {
+         if (instr->is_shadow) {
+            target = TGSI_TEXTURE_SHADOWCUBE_ARRAY;
+         } else {
+            target = TGSI_TEXTURE_CUBE_ARRAY;
+         }
+      } else {
+         if (instr->is_shadow) {
+            target = TGSI_TEXTURE_SHADOWCUBE;
+         } else {
+            target = TGSI_TEXTURE_CUBE;
+         }
+      }
+      break;
+   case GLSL_SAMPLER_DIM_BUF:
+      target = TGSI_TEXTURE_BUFFER;
+      break;
+   default:
+      fprintf(stderr, "Unknown sampler dimensions: %d\n", instr->sampler_dim);
+      abort();
+   }
+
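+   /* If the packed sources spilled over into a second vec4 operand, switch
+    * to the two-operand forms of the texture opcodes.
+    */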
+   if (s.i > 1) {
+      if (tex_opcode == TGSI_OPCODE_TEX)
+         tex_opcode = TGSI_OPCODE_TEX2;
+      if (tex_opcode == TGSI_OPCODE_TXB)
+         tex_opcode = TGSI_OPCODE_TXB2;
+      if (tex_opcode == TGSI_OPCODE_TXL)
+         tex_opcode = TGSI_OPCODE_TXL2;
+   }
+
+   if (instr->op == nir_texop_txd) {
+      /* Derivs appear in their own src args */
+      int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
+      int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
+      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
+      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
+   }
+
+   if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
+      if (c->screen->get_param(c->screen,
+                               PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
+         sampler = ureg_scalar(sampler, instr->component);
+         s.srcs[s.i++] = ureg_src_undef();
+      } else {
+         s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
+      }
+   }
+
+   s.srcs[s.i++] = sampler;
+
+   enum tgsi_return_type tex_type;
+   switch (instr->dest_type) {
+   case nir_type_float:
+      tex_type = TGSI_RETURN_TYPE_FLOAT;
+      break;
+   case nir_type_int:
+      tex_type = TGSI_RETURN_TYPE_SINT;
+      break;
+   case nir_type_uint:
+      tex_type = TGSI_RETURN_TYPE_UINT;
+      break;
+   default:
+      unreachable("unknown texture type");
+   }
+
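+   /* TGSI texture offsets are encoded as a register reference on the
+    * instruction rather than as a normal source, so copy over the
+    * file/index/swizzle of the offset value.
+    */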
+   struct tgsi_texture_offset tex_offsets[4];
+   unsigned num_tex_offsets = 0;
+   int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
+   if (tex_offset_src >= 0) {
+      struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
+
+      tex_offsets[0].File = offset.File;
+      tex_offsets[0].Index = offset.Index;
+      tex_offsets[0].SwizzleX = offset.SwizzleX;
+      tex_offsets[0].SwizzleY = offset.SwizzleY;
+      tex_offsets[0].SwizzleZ = offset.SwizzleZ;
+      tex_offsets[0].Padding = 0;
+
+      num_tex_offsets = 1;
+   }
+
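+   /* TXQ returns the mipmap level count in .w, so for query_levels we query
+    * into a temporary and then move .w into the real destination.
+    */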
+   struct ureg_dst tex_dst;
+   if (instr->op == nir_texop_query_levels)
+      tex_dst = ureg_writemask(ureg_DECL_temporary(c->ureg), TGSI_WRITEMASK_W);
+   else
+      tex_dst = dst;
+
+   ureg_tex_insn(c->ureg, tex_opcode,
+                 &tex_dst, 1,
+                 target,
+                 tex_type,
+                 tex_offsets, num_tex_offsets,
+                 s.srcs, s.i);
+
+   if (instr->op == nir_texop_query_levels) {
+      ureg_MOV(c->ureg, dst, ureg_scalar(ureg_src(tex_dst), 3));
+      ureg_release_temporary(c->ureg, tex_dst);
+   }
+
+   for (int i = 0; i < s.i; i++) {
+      if (s.is_temp[i])
+         ureg_release_temporary(c->ureg, ureg_dst(s.srcs[i]));
+   }
+}
+
+static void
+ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
+{
+   switch (jump->type) {
+   case nir_jump_break:
+      ureg_BRK(c->ureg);
+      break;
+
+   case nir_jump_continue:
+      ureg_CONT(c->ureg);
+      break;
+
+   default:
+      fprintf(stderr, "Unknown jump instruction: ");
+      nir_print_instr(&jump->instr, stderr);
+      fprintf(stderr, "\n");
+      abort();
+   }
+}
+
+static void
+ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
+{
+   /* Nothing to do but make sure that we have some storage to deref. */
+   (void)ntt_get_ssa_def_decl(c, &instr->def);
+}
+
+static void
+ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
+{
+   /* There is no addr reg in use before we start emitting an instr. */
+   c->next_addr_reg = 0;
+
+   switch (instr->type) {
+   case nir_instr_type_deref:
+      /* ignored, will be walked by nir_intrinsic_image_*_deref. */
+      break;
+
+   case nir_instr_type_alu:
+      ntt_emit_alu(c, nir_instr_as_alu(instr));
+      break;
+
+   case nir_instr_type_intrinsic:
+      ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
+      break;
+
+   case nir_instr_type_load_const:
+      /* Nothing to do here, as load consts are done directly from
+       * ntt_get_src() (since many constant NIR srcs will often get folded
+       * directly into a register file index instead of as a TGSI src).
+       */
+      break;
+
+   case nir_instr_type_tex:
+      ntt_emit_texture(c, nir_instr_as_tex(instr));
+      break;
+
+   case nir_instr_type_jump:
+      ntt_emit_jump(c, nir_instr_as_jump(instr));
+      break;
+
+   case nir_instr_type_ssa_undef:
+      ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
+      break;
+
+   default:
+      fprintf(stderr, "Unknown NIR instr type: ");
+      nir_print_instr(instr, stderr);
+      fprintf(stderr, "\n");
+      abort();
+   }
+}
+
+static void
+ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
+{
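+   /* The branch target of UIF (and ELSE) isn't known until we've emitted the
+    * body, so ureg gives us back a label token to patch with the instruction
+    * number afterwards.
+    */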
+   unsigned label;
+   ureg_UIF(c->ureg, ntt_get_src(c, if_stmt->condition), &label);
+   ntt_emit_cf_list(c, &if_stmt->then_list);
+
+   if (!exec_list_is_empty(&if_stmt->else_list)) {
+      ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
+      ureg_ELSE(c->ureg, &label);
+      ntt_emit_cf_list(c, &if_stmt->else_list);
+   }
+
+   ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
+   ureg_ENDIF(c->ureg);
+}
+
+static void
+ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
+{
+   unsigned last_loop_label = c->loop_label;
+
+   unsigned begin_label;
+   ureg_BGNLOOP(c->ureg, &begin_label);
+   ntt_emit_cf_list(c, &loop->body);
+
+   /* XXX: Need to set cont/break labels for svga, nv30, nv50.
+    *
+    * ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
+    */
+   unsigned end_label;
+   ureg_ENDLOOP(c->ureg, &end_label);
+
+   c->loop_label = last_loop_label;
+}
+
+static void
+ntt_free_ssa_temp_by_index(struct ntt_compile *c, int index)
+{
+   /* We do store CONST/IMM/INPUT/etc. in ssa_temp[] */
+   if (c->ssa_temp[index].File != TGSI_FILE_TEMPORARY)
+      return;
+
+   ureg_release_temporary(c->ureg, c->ssa_temp[index]);
+   memset(&c->ssa_temp[index], 0, sizeof(c->ssa_temp[index]));
+}
+
+/* Releases any temporaries for SSA defs with a live interval ending at this
+ * instruction.
+ */
+static bool
+ntt_src_live_interval_end_cb(nir_src *src, void *state)
+{
+   struct ntt_compile *c = state;
+
+   if (src->is_ssa) {
+      nir_ssa_def *def = src->ssa;
+
+      if (c->liveness->defs[def->index].end == src->parent_instr->index)
+         ntt_free_ssa_temp_by_index(c, def->index);
+   }
+
+   return true;
+}
+
+static void
+ntt_emit_block(struct ntt_compile *c, nir_block *block)
+{
+   nir_foreach_instr(instr, block) {
+      ntt_emit_instr(c, instr);
+
+      nir_foreach_src(instr, ntt_src_live_interval_end_cb, c);
+   }
+
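+   /* Also release temporaries for SSA defs that are live out of the block
+    * but whose interval ends at the block's end (for example, a value
+    * consumed only by a following if condition).
+    */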
+   unsigned index;
+   BITSET_FOREACH_SET(index, block->live_out, BITSET_WORDS(c->impl->ssa_alloc)) {
+      unsigned def_end_ip = c->liveness->defs[index].end;
+      if (def_end_ip == block->end_ip)
+         ntt_free_ssa_temp_by_index(c, index);
+   }
+}
+
+static void
+ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
+{
+   /* There is no addr reg in use before we start emitting any part of a CF
+    * node (such as an if condition)
+    */
+   c->next_addr_reg = 0;
+
+   foreach_list_typed(nir_cf_node, node, node, list) {
+      switch (node->type) {
+      case nir_cf_node_block:
+         ntt_emit_block(c, nir_cf_node_as_block(node));
+         break;
+
+      case nir_cf_node_if:
+         ntt_emit_if(c, nir_cf_node_as_if(node));
+         break;
+
+      case nir_cf_node_loop:
+         ntt_emit_loop(c, nir_cf_node_as_loop(node));
+         break;
+
+      default:
+         unreachable("unknown CF type");
+      }
+   }
+}
+
+static void
+ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
+{
+   /* reindex values so the numbers are reasonably small despite
+    * optimization having deleted most of them.
+    */
+   nir_index_ssa_defs(impl);
+   nir_index_local_regs(impl);
+
+   nir_index_instrs(impl);
+
+   c->impl = impl;
+   c->liveness = nir_live_ssa_defs_per_instr(impl);
+
+   c->ssa_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
+   c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);
+
+   ntt_setup_registers(c, &impl->registers);
+   ntt_emit_cf_list(c, &impl->body);
+}
+
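+/* Type size callback for nir_lower_io: each input/output slot is a TGSI
+ * vec4, so count attribute slots.
+ */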
+static int
+type_size(const struct glsl_type *type, bool bindless)
+{
+   return glsl_count_attribute_slots(type, false);
+}
+
+/* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
+ * can handle for 64-bit values in TGSI.
+ */
+static bool
+ntt_should_vectorize_instr(const nir_instr *in_a, const nir_instr *in_b,
+                           void *data)
+{
+   if (in_a->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *a = nir_instr_as_alu(in_a);
+   nir_alu_instr *b = nir_instr_as_alu(in_b);
+
+   unsigned a_num_components = a->dest.dest.ssa.num_components;
+   unsigned b_num_components = b->dest.dest.ssa.num_components;
+
+   int src_bit_size = nir_src_bit_size(a->src[0].src);
+   int dst_bit_size = nir_dest_bit_size(a->dest.dest);
+
+   if (src_bit_size == 64 || dst_bit_size == 64) {
+      if (a_num_components + b_num_components > 2)
+         return false;
+   }
+
+   return true;
+}
+
+static bool
+ntt_should_vectorize_io(unsigned align, unsigned bit_size,
+                        unsigned num_components, unsigned high_offset,
+                        nir_intrinsic_instr *low, nir_intrinsic_instr *high)
+{
+   if (bit_size != 32)
+      return false;
+
+   /* Our offset alignment should always be at least 4 bytes */
+   if (align < 4)
+      return false;
+
+   /* No wrapping off the end of a TGSI reg.  We could do a bit better by
+    * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
+    * need this restriction.
+    */
+   unsigned worst_start_component = align == 4 ? 3 : align / 4;
+   if (worst_start_component + num_components > 4)
+      return false;
+
+   return true;
+}
+
+static nir_variable_mode
+ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
+{
+   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
+   unsigned indirect_mask = 0;
+
+   if (!screen->get_shader_param(screen, pipe_stage,
+                                 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
+      indirect_mask |= nir_var_shader_in;
+   }
+
+   if (!screen->get_shader_param(screen, pipe_stage,
+                                 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
+      indirect_mask |= nir_var_shader_out;
+   }
+
+   if (!screen->get_shader_param(screen, pipe_stage,
+                                 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
+      indirect_mask |= nir_var_function_temp;
+   }
+
+   return indirect_mask;
+}
+
+static void
+ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
+{
+   bool progress;
+   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
+   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
+   unsigned control_flow_depth =
+      screen->get_shader_param(screen, pipe_stage,
+                               PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
+   do {
+      progress = false;
+
+      NIR_PASS_V(s, nir_lower_vars_to_ssa);
+
+      NIR_PASS(progress, s, nir_copy_prop);
+      NIR_PASS(progress, s, nir_opt_algebraic);
+      NIR_PASS(progress, s, nir_opt_remove_phis);
+      NIR_PASS(progress, s, nir_opt_conditional_discard);
+      NIR_PASS(progress, s, nir_opt_dce);
+      NIR_PASS(progress, s, nir_opt_dead_cf);
+      NIR_PASS(progress, s, nir_opt_cse);
+      NIR_PASS(progress, s, nir_opt_find_array_copies);
+      NIR_PASS(progress, s, nir_opt_if, true);
+      NIR_PASS(progress, s, nir_opt_peephole_select,
+               control_flow_depth == 0 ? ~0 : 8, true, true);
+      NIR_PASS(progress, s, nir_opt_algebraic);
+      NIR_PASS(progress, s, nir_opt_constant_folding);
+      NIR_PASS(progress, s, nir_opt_load_store_vectorize, nir_var_mem_ubo,
+               ntt_should_vectorize_io, 0);
+      NIR_PASS(progress, s, nir_opt_shrink_vectors);
+      NIR_PASS(progress, s, nir_opt_trivial_continues);
+      NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
+      NIR_PASS(progress, s, nir_opt_undef);
+      NIR_PASS(progress, s, nir_opt_loop_unroll, no_indirects_mask);
+
+   } while (progress);
+}
+
+/* Scalarizes all 64-bit ALU ops.  Note that we only actually need to
+ * scalarize vec3/vec4s; we should probably fix that.
+ */
+static bool
+scalarize_64bit(const nir_instr *instr, const void *data)
+{
+   const nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+   return (nir_dest_bit_size(alu->dest.dest) == 64 ||
+           nir_src_bit_size(alu->src[0].src) == 64);
+}
+
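+/* Splits a 64-bit vec3/vec4 load or store intrinsic into a vec2 part plus a
+ * second intrinsic for the remaining components, bumping the second one's
+ * base or offset, so that the translator only ever sees 64-bit values that
+ * fit in a single TGSI register (up to a vec2).
+ */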
+static bool
+nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
+{
+   b->cursor = nir_after_instr(&instr->instr);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_uniform:
+   case nir_intrinsic_load_ubo:
+   case nir_intrinsic_load_ubo_vec4:
+   case nir_intrinsic_load_ssbo:
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_interpolated_input:
+   case nir_intrinsic_load_per_vertex_input:
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_ssbo:
+      break;
+   default:
+      return false;
+   }
+
+   if (instr->num_components <= 2)
+      return false;
+
+   bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
+   if (has_dest) {
+      if (nir_dest_bit_size(instr->dest) != 64)
+         return false;
+   } else {
+      if (nir_src_bit_size(instr->src[0]) != 64)
+         return false;
+   }
+
+   nir_intrinsic_instr *first =
+      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
+   nir_intrinsic_instr *second =
+      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_uniform:
+      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
+      break;
+
+   case nir_intrinsic_load_ubo:
+   case nir_intrinsic_load_ubo_vec4:
+   case nir_intrinsic_load_ssbo:
+   case nir_intrinsic_store_ssbo:
+      break;
+
+   default: {
+      nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
+      semantics.location++;
+      semantics.num_slots--;
+      nir_intrinsic_set_io_semantics(second, semantics);
+
+      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
+      break;
+   }
+   }
+
+   first->num_components = 2;
+   second->num_components -= 2;
+   if (has_dest) {
+      first->dest.ssa.num_components = 2;
+      second->dest.ssa.num_components -= 2;
+   }
+
+   nir_builder_instr_insert(b, &first->instr);
+   nir_builder_instr_insert(b, &second->instr);
+
+   if (has_dest) {
+      /* Merge the two loads' results back into a vector. */
+      nir_ssa_def *channels[4] = {
+         nir_channel(b, &first->dest.ssa, 0),
+         nir_channel(b, &first->dest.ssa, 1),
+         nir_channel(b, &second->dest.ssa, 0),
+         second->num_components > 1 ? nir_channel(b, &second->dest.ssa, 1) : NULL,
+      };
+      nir_ssa_def *new = nir_vec(b, channels, instr->num_components);
+      nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(new));
+   } else {
+      /* Split the src value across the two stores. */
+      b->cursor = nir_before_instr(&instr->instr);
+
+      nir_ssa_def *src0 = instr->src[0].ssa;
+      nir_ssa_def *channels[4] = { 0 };
+      for (int i = 0; i < instr->num_components; i++)
+         channels[i] = nir_channel(b, src0, i);
+
+      nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
+      nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
+
+      nir_instr_rewrite_src(&first->instr, &first->src[0],
+                            nir_src_for_ssa(nir_vec(b, channels, 2)));
+      nir_instr_rewrite_src(&second->instr, &second->src[0],
+                            nir_src_for_ssa(nir_vec(b, &channels[2],
+                                                    second->num_components)));
+   }
+
+   int offset_src = -1;
+   uint32_t offset_amount = 16;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_ssbo:
+   case nir_intrinsic_load_ubo:
+      offset_src = 1;
+      break;
+   case nir_intrinsic_load_ubo_vec4:
+      offset_src = 1;
+      offset_amount = 1;
+      break;
+   case nir_intrinsic_store_ssbo:
+      offset_src = 2;
+      break;
+   default:
+      break;
+   }
+   if (offset_src != -1) {
+      b->cursor = nir_before_instr(&second->instr);
+      nir_ssa_def *second_offset =
+         nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
+      nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
+                            nir_src_for_ssa(second_offset));
+   }
+
+   /* DCE stores we generated with no writemask (nothing else does this
+    * currently).
+    */
+   if (!has_dest) {
+      if (nir_intrinsic_write_mask(first) == 0)
+         nir_instr_remove(&first->instr);
+      if (nir_intrinsic_write_mask(second) == 0)
+         nir_instr_remove(&second->instr);
+   }
+
+   nir_instr_remove(&instr->instr);
+
+   return true;
+}
+
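+/* Applies the same vec2 split to 64-bit vec3/vec4 load_const instructions,
+ * recombining the pieces with a vec so that users are unaffected.
+ */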
+static bool
+nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
+{
+   int num_components = instr->def.num_components;
+
+   if (instr->def.bit_size != 64 || num_components <= 2)
+      return false;
+
+   b->cursor = nir_before_instr(&instr->instr);
+
+   nir_load_const_instr *first =
+      nir_load_const_instr_create(b->shader, 2, 64);
+   nir_load_const_instr *second =
+      nir_load_const_instr_create(b->shader, num_components - 2, 64);
+
+   first->value[0] = instr->value[0];
+   first->value[1] = instr->value[1];
+   second->value[0] = instr->value[2];
+   if (num_components == 4)
+      second->value[1] = instr->value[3];
+
+   nir_builder_instr_insert(b, &first->instr);
+   nir_builder_instr_insert(b, &second->instr);
+
+   nir_ssa_def *channels[4] = {
+      nir_channel(b, &first->def, 0),
+      nir_channel(b, &first->def, 1),
+      nir_channel(b, &second->def, 0),
+      num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
+   };
+   nir_ssa_def *new = nir_vec(b, channels, num_components);
+   nir_ssa_def_rewrite_uses(&instr->def, nir_src_for_ssa(new));
+   nir_instr_remove(&instr->instr);
+
+   return true;
+}
+
+static bool
+nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
+                                      void *data)
+{
+   switch (instr->type) {
+   case nir_instr_type_load_const:
+      return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
+
+   case nir_instr_type_intrinsic:
+      return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
+   default:
+      return false;
+   }
+}
+
+static bool
+nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
+{
+   return nir_shader_instructions_pass(s,
+                                       nir_to_tgsi_lower_64bit_to_vec2_instr,
+                                       nir_metadata_block_index |
+                                       nir_metadata_dominance,
+                                       NULL);
+}
+
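+/* nir_to_tgsi has no direct translation for these ALU ops, so drivers using
+ * it are expected to set the corresponding lowering flags in their NIR
+ * compiler options.
+ */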
+static void
+ntt_sanity_check_driver_options(struct nir_shader *s)
+{
+   UNUSED const struct nir_shader_compiler_options *options = s->options;
+
+   assert(options->lower_extract_byte);
+   assert(options->lower_extract_word);
+   assert(options->lower_fdph);
+   assert(options->lower_flrp64);
+   assert(options->lower_fmod);
+   assert(options->lower_rotate);
+   assert(options->lower_vector_cmp);
+}
+
+const void *
+nir_to_tgsi(struct nir_shader *s,
+            struct pipe_screen *screen)
+{
+   struct ntt_compile *c;
+   const void *tgsi_tokens;
+   bool debug = env_var_as_boolean("NIR_TO_TGSI_DEBUG", false);
+   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
+   bool native_integers = screen->get_shader_param(screen,
+                                                   pipe_shader_type_from_mesa(s->info.stage),
+                                                   PIPE_SHADER_CAP_INTEGERS);
+
+   ntt_sanity_check_driver_options(s);
+
+   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+              type_size, (nir_lower_io_options)0);
+   NIR_PASS_V(s, nir_lower_regs_to_ssa);
+
+   const nir_lower_tex_options lower_tex_options = {
+      /* XXX: We could skip lowering of TXP for TEX with <=3 coord_components.
+       */
+      .lower_txp = ~0,
+   };
+   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
+
+   /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
+    * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
+    * duplication logic we just make it so that we only see vec2s.
+    */
+   NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
+   NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
+
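+   /* If the driver can't do TGSI LOAD from constant buffers, lower UBO
+    * access to vec4-granular load_ubo_vec4 so it can be emitted as indexed
+    * CONST[] reads instead.
+    */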
+   if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
+      NIR_PASS_V(s, nir_lower_ubo_vec4);
+
+   ntt_optimize_nir(s, screen);
+
+   NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
+
+   bool progress;
+   do {
+      progress = false;
+      NIR_PASS(progress, s, nir_opt_algebraic_late);
+      if (progress) {
+         NIR_PASS_V(s, nir_copy_prop);
+         NIR_PASS_V(s, nir_opt_dce);
+         NIR_PASS_V(s, nir_opt_cse);
+      }
+   } while (progress);
+
+   if (screen->get_shader_param(screen,
+                                pipe_shader_type_from_mesa(s->info.stage),
+                                PIPE_SHADER_CAP_INTEGERS)) {
+      NIR_PASS_V(s, nir_lower_bool_to_int32);
+   } else {
+      NIR_PASS_V(s, nir_lower_int_to_float);
+      NIR_PASS_V(s, nir_lower_bool_to_float);
+   }
+
+   NIR_PASS_V(s, nir_lower_to_source_mods,
+              nir_lower_float_source_mods |
+              nir_lower_int_source_mods); /* no doubles */
+   NIR_PASS_V(s, nir_convert_from_ssa, true);
+   NIR_PASS_V(s, nir_lower_vec_to_movs);
+
+   /* locals_to_regs will leave dead derefs that are good to clean up. */
+   NIR_PASS_V(s, nir_lower_locals_to_regs);
+   NIR_PASS_V(s, nir_opt_dce);
+
+   if (debug) {
+      fprintf(stderr, "NIR before translation to TGSI:\n");
+      nir_print_shader(s, stderr);
+   }
+
+   c = rzalloc(NULL, struct ntt_compile);
+   c->screen = screen;
+
+   c->needs_texcoord_semantic =
+      screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
+   c->any_reg_as_address =
+      screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
+
+   c->s = s;
+   c->native_integers = native_integers;
+   c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
+   ureg_setup_shader_info(c->ureg, &s->info);
+
+   ntt_setup_inputs(c);
+   ntt_setup_uniforms(c);
+
+   if (s->info.stage == MESA_SHADER_FRAGMENT) {
+      /* The draw module's polygon stipple layer doesn't respect the chosen
+       * coordinate mode, so leave it as unspecified unless we're actually
+       * reading the position in the shader already.  See
+       * gl-2.1-polygon-stipple-fs on softpipe.
+       */
+      if ((s->info.inputs_read & VARYING_BIT_POS) ||
+          (s->info.system_values_read & (1ull << SYSTEM_VALUE_FRAG_COORD))) {
+         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
+                       s->info.fs.origin_upper_left ?
+                       TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
+                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
+
+         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
+                       s->info.fs.pixel_center_integer ?
+                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
+                       TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
+      }
+   }
+   /* Emit the main function */
+   nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
+   ntt_emit_impl(c, impl);
+   ureg_END(c->ureg);
+
+   tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
+
+   if (debug) {
+      fprintf(stderr, "TGSI after translation from NIR:\n");
+      tgsi_dump(tgsi_tokens, 0);
+   }
+
+   ureg_destroy(c->ureg);
+
+   ralloc_free(c);
+
+   return tgsi_tokens;
+}
diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.h b/src/gallium/auxiliary/nir/nir_to_tgsi.h
new file mode 100644
index 0000000..e4cc044
--- /dev/null
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef NIR_TO_TGSI_H
+#define NIR_TO_TGSI_H
+
+struct nir_shader;
+struct pipe_screen;
+
+const void *nir_to_tgsi(struct nir_shader *s,
+                        struct pipe_screen *screen);
+
+#endif /* NIR_TO_TGSI_H */