/* src/gallium/drivers/nv50/nv50_program.c - platform/external/mesa3d - Git at Google */

 /*
  * Copyright 2010 Christoph Bumiller
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */

 #include "nv50_program.h"
 #include "nv50_context.h"

 #include "codegen/nv50_ir_driver.h"

 /* Count the set bits in the low nibble of val; all higher bits are ignored. */
 static INLINE unsigned
 bitcount4(const uint32_t val)
 {
    const uint32_t nibble = val & 0xf;

    /* Add up the four bits one by one. */
    return (nibble & 1) + ((nibble >> 1) & 1) +
           ((nibble >> 2) & 1) + ((nibble >> 3) & 1);
 }

 /*
  * Assign hardware slots to the inputs, system values and outputs of a
  * vertex program (nv50_program_assign_varying_slots also routes geometry
  * programs here).
  *
  * Reads the varying descriptions from info, writes the per-component slot
  * numbers back into info->in[], info->sv[] and info->out[], and mirrors the
  * bookkeeping into the nv50_program stored in info->driverPriv.
  *
  * Always returns 0.
  */
 static int
 nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
 {
    struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
    unsigned i, n, c;

    /* Inputs: enabled components get consecutive slots starting at 0. */
    n = 0;
    for (i = 0; i < info->numInputs; ++i) {
       prog->in[i].id = i;
       prog->in[i].sn = info->in[i].sn;
       prog->in[i].si = info->in[i].si;
       prog->in[i].hw = n;
       prog->in[i].mask = info->in[i].mask;

       /* Four enable bits per input, eight inputs per 32-bit attrs word.
        * The cast keeps the shift in unsigned arithmetic: a mask promoted to
        * (signed) int and shifted left by 28 could overflow, which is
        * undefined behaviour in C.
        */
       prog->vp.attrs[(4 * i) / 32] |=
          (uint32_t)info->in[i].mask << ((4 * i) % 32);

       for (c = 0; c < 4; ++c)
          if (info->in[i].mask & (1 << c))
             info->in[i].slot[c] = n++;
    }
    prog->in_nr = info->numInputs;

    /* System values are enabled through flag bits in attrs[2]. */
    for (i = 0; i < info->numSysVals; ++i) {
       switch (info->sv[i].sn) {
       case TGSI_SEMANTIC_INSTANCEID:
          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
          continue;
       case TGSI_SEMANTIC_VERTEXID:
          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_UNK12;
          continue;
       default:
          break;
       }
    }

    /*
     * Corner case: VP has no inputs, but we will still need to submit data to
     * draw it. HW will shout at us and won't draw anything if we don't enable
     * any input, so let's just pretend it's the first one.
     */
    if (prog->vp.attrs[0] == 0 &&
        prog->vp.attrs[1] == 0 &&
        prog->vp.attrs[2] == 0)
       prog->vp.attrs[0] |= 0xf;

    /* VertexID before InstanceID; both continue the input slot numbering. */
    if (info->io.vertexId < info->numSysVals)
       info->sv[info->io.vertexId].slot[0] = n++;
    if (info->io.instanceId < info->numSysVals)
       info->sv[info->io.instanceId].slot[0] = n++;

    /* Outputs: numbering restarts at 0; special semantics are remembered. */
    n = 0;
    for (i = 0; i < info->numOutputs; ++i) {
       switch (info->out[i].sn) {
       case TGSI_SEMANTIC_PSIZE:
          prog->vp.psiz = i; /* output index for now, converted below */
          break;
       case TGSI_SEMANTIC_CLIPDIST:
          prog->vp.clpd[info->out[i].si] = n; /* first slot of this output */
          break;
       case TGSI_SEMANTIC_EDGEFLAG:
          prog->vp.edgeflag = i;
          break;
       case TGSI_SEMANTIC_BCOLOR:
          prog->vp.bfc[info->out[i].si] = i;
          break;
       default:
          break;
       }
       prog->out[i].id = i;
       prog->out[i].sn = info->out[i].sn;
       prog->out[i].si = info->out[i].si;
       prog->out[i].hw = n;
       prog->out[i].mask = info->out[i].mask;

       for (c = 0; c < 4; ++c)
          if (info->out[i].mask & (1 << c))
             info->out[i].slot[c] = n++;
    }
    prog->out_nr = info->numOutputs;
    prog->max_out = n;

    /* If psiz holds a valid output index, replace it with that output's
     * first hardware slot.
     */
    if (prog->vp.psiz < info->numOutputs)
       prog->vp.psiz = prog->out[prog->vp.psiz].hw;

    return 0;
 }

 /*
  * Assign interpolant slots to the inputs of a fragment program and result
  * slots to its outputs.
  *
  * Inputs are recorded in prog->in[] with all non-flat inputs ahead of the
  * flat ones; POSITION and FACE inputs are not recorded there. The slot
  * numbers are written back into info->in[].slot[] / info->out[].slot[], and
  * the interpolant counts are packed into prog->fp.interp.
  *
  * Always returns 0.
  */
 static int
 nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
 {
    struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
    unsigned i, n, m, c;
    unsigned nvary;
    unsigned nflat;
    unsigned nintp = 0;

    /* count recorded non-flat inputs */
    for (m = 0, i = 0; i < info->numInputs; ++i) {
       switch (info->in[i].sn) {
       case TGSI_SEMANTIC_POSITION:
       case TGSI_SEMANTIC_FACE:
          continue;
       default:
          m += info->in[i].flat ? 0 : 1;
          break;
       }
    }
    /* m is now the index in prog->in[] at which the flat inputs will start */
    /* careful: id may be != i in info->in[prog->in[i].id] */

    /* Fill prog->in[] so that non-flat inputs are first and
     * kick out special inputs that don't use the RESULT_MAP.
     */
    for (n = 0, i = 0; i < info->numInputs; ++i) {
       if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
          /* position component mask lives in bits 24..27 of fp.interp;
           * position components take the first interpolant slots
           */
          prog->fp.interp |= info->in[i].mask << 24;
          for (c = 0; c < 4; ++c)
             if (info->in[i].mask & (1 << c))
                info->in[i].slot[c] = nintp++;
       } else
       if (info->in[i].sn == TGSI_SEMANTIC_FACE) {
          /* FACE gets the fixed slot value 255 instead of an interpolant */
          info->in[i].slot[0] = 255;
       } else {
          /* non-flat inputs fill [0, m), flat inputs are appended from m on */
          unsigned j = info->in[i].flat ? m++ : n++;

          /* remember where the COLOR inputs ended up in prog->in[] */
          if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
             prog->vp.bfc[info->in[i].si] = j;

          prog->in[j].id = i;
          prog->in[j].mask = info->in[i].mask;
          prog->in[j].sn = info->in[i].sn;
          prog->in[j].si = info->in[i].si;
          prog->in[j].linear = info->in[i].linear;

          prog->in_nr++;
       }
    }
    /* The 4th position component (mask bit 3) is always enabled; if the
     * shader did not request it, reserve a slot for it here.
     */
    if (!(prog->fp.interp & (8 << 24))) {
       ++nintp;
       prog->fp.interp |= 8 << 24;
    }

    /* Hand out slots in prog->in[] order (non-flat first, then flat) and
     * write them back to the corresponding info->in[] entry.
     */
    for (i = 0; i < prog->in_nr; ++i) {
       int j = prog->in[i].id;

       prog->in[i].hw = nintp;
       for (c = 0; c < 4; ++c)
          if (prog->in[i].mask & (1 << c))
             info->in[j].slot[c] = nintp++;
    }
    /* (n == m) if m never increased, i.e. no flat inputs */
    /* prog->in[n] is the first flat input, so nflat counts the slots from
     * its first slot up to the end
     */
    nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
    nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
    nvary = nintp - nflat;

    prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
    prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;

    /* put front/back colors right after HPOS */
    prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
    /* add the component count of each COLOR input that was seen
     * (0xff in bfc[] means "not present")
     */
    for (i = 0; i < 2; ++i)
       if (prog->vp.bfc[i] < 0xff)
          prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;

    /* FP outputs */

    if (info->prop.fp.numColourResults > 1)
       prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;

    for (i = 0; i < info->numOutputs; ++i) {
       prog->out[i].id = i;
       prog->out[i].sn = info->out[i].sn;
       prog->out[i].si = info->out[i].si;
       prog->out[i].mask = info->out[i].mask;

       /* depth and sample mask get their slots after this loop */
       if (i == info->io.fragDepth || i == info->io.sampleMask)
          continue;
       /* every other output owns four consecutive slots selected by its
        * semantic index
        */
       prog->out[i].hw = info->out[i].si * 4;

       for (c = 0; c < 4; ++c)
          info->out[i].slot[c] = prog->out[i].hw + c;

       prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
    }

    /* sample mask first, then depth, each appended after the slots above */
    if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
       info->out[info->io.sampleMask].slot[0] = prog->max_out++;

    /* note: the depth slot is stored in component 2 of the output */
    if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
       info->out[info->io.fragDepth].slot[2] = prog->max_out++;

    /* never report zero outputs */
    if (!prog->max_out)
       prog->max_out = 4;

    return 0;
 }

 /*
  * Slot assignment entry point: dispatch on the shader type in info->type.
  * Vertex and geometry programs share nv50_vertprog_assign_slots(), fragment
  * programs use nv50_fragprog_assign_slots(); any other type yields -1.
  */
 static int
 nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
 {
    if (info->type == PIPE_SHADER_FRAGMENT)
       return nv50_fragprog_assign_slots(info);

    if (info->type == PIPE_SHADER_VERTEX ||
        info->type == PIPE_SHADER_GEOMETRY)
       return nv50_vertprog_assign_slots(info);

    return -1;
 }

 static struct nv50_stream_output_state *
 nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
                                   const struct pipe_stream_output_info *pso)
 {
    struct nv50_stream_output_state *so;
    unsigned b, i, c;
    unsigned base[4];

    so = MALLOC_STRUCT(nv50_stream_output_state);
    if (!so)
       return NULL;
    memset(so->map, 0xff, sizeof(so->map));

    for (b = 0; b < 4; ++b)
       so->num_attribs[b] = 0;
    for (i = 0; i < pso->num_outputs; ++i) {
       unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
       b = pso->output[i].output_buffer;
       assert(b < 4);
       so->num_attribs[b] = MAX2(so->num_attribs[b], end);
    }

    so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;

    so->stride[0] = pso->stride[0] * 4;
    base[0] = 0;
    for (b = 1; b < 4; ++b) {
       assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
       so->stride[b] = so->num_attribs[b] * 4;
       if (so->num_attribs[b])
          so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
       base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
    }
    if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
       assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
       so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
    }

    so->map_size = base[3] + so->num_attribs[3];

    for (i = 0; i < pso->num_outputs; ++i) {
       const unsigned s = pso->output[i].start_component;
       const unsigned p = pso->output[i].dst_offset;
       const unsigned r = pso->output[i].register_index;
       b = pso->output[i].output_buffer;

       for (c = 0; c < pso->output[i].num_components; ++c)
          so->map[base[b] + p + c] = info->out[r].slot[s + c];
    }

    return so;
 }

 /*
  * Translate the TGSI tokens of prog into hardware code for chipset.
  *
  * On success the generated code, its size, the relocation data, the GPR
  * count and the TLS size are stored in prog; fragment program control flags
  * are updated, and prog->so is created when the shader has stream outputs.
  *
  * Returns FALSE if the temporary info structure could not be allocated or
  * if code generation reported an error, and a true value otherwise.
  */
 boolean
 nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
 {
    struct nv50_ir_prog_info *info = CALLOC_STRUCT(nv50_ir_prog_info);
    uint8_t map_undef;
    int ret;

    if (!info)
       return FALSE;

    /* value used for "no such output" in the vp.clpd / vp.psiz fields */
    if (prog->type == PIPE_SHADER_VERTEX)
       map_undef = 0x40;
    else
       map_undef = 0x80;

    info->type = prog->type;
    info->target = chipset;
    info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    info->bin.source = (void *)prog->pipe.tokens;

    info->io.ucpBinding = 15;
    info->io.ucpBase = 0;
    info->io.genUserClip = prog->vp.clpd_nr;

    info->assignSlots = nv50_program_assign_varying_slots;

    /* reset the fields that the slot assignment callbacks fill in */
    prog->vp.bfc[0] = 0xff;
    prog->vp.bfc[1] = 0xff;
    prog->vp.edgeflag = 0xff;
    prog->vp.clpd[0] = map_undef;
    prog->vp.clpd[1] = map_undef;
    prog->vp.psiz = map_undef;
    prog->gp.primid = 0x80;

    info->driverPriv = prog;

 #ifndef DEBUG
    info->optLevel = 3;
 #else
    info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
    info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
 #endif

    ret = nv50_ir_generate_code(info);
    if (ret) {
       NOUVEAU_ERR("shader translation failed: %i\n", ret);
    } else {
       if (info->bin.syms) /* we don't need them yet */
          FREE(info->bin.syms);

       /* prog takes over the code and relocation buffers */
       prog->code = info->bin.code;
       prog->code_size = info->bin.codeSize;
       prog->fixups = info->bin.relocData;
       prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
       prog->tls_space = info->bin.tlsSpace;

       if (prog->type == PIPE_SHADER_FRAGMENT) {
          if (info->prop.fp.writesDepth) {
             prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
             prog->fp.flags[1] = 0x11;
          }
          if (info->prop.fp.usesDiscard)
             prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
       }

       if (prog->pipe.stream_output.num_outputs)
          prog->so = nv50_program_create_strmout_state(info,
                                                       &prog->pipe.stream_output);
    }

    FREE(info);
    return !ret;
 }

 /*
  * Allocate space for a translated program in the code heap of its shader
  * type, apply relocations for the final code address and upload the code.
  *
  * If the heap is full, all programs in it are evicted and the allocation is
  * retried once.
  *
  * Returns TRUE on success. Returns FALSE for an invalid program type, when
  * the program does not fit into the code heap even after eviction, or when
  * nv50_tls_realloc() fails.
  */
 boolean
 nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
 {
    struct nouveau_heap *heap;
    int ret;
    uint32_t size = align(prog->code_size, 0x40);

    /* Each shader type allocates from the heap named after it. */
    switch (prog->type) {
    case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
    case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
    case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
    default:
       assert(!"invalid program type");
       return FALSE;
    }

    ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
    if (ret) {
       /* Out of space: evict everything to compactify the code segment, hoping
        * the working set is much smaller and drifts slowly. Improve me !
        */
       /* NOTE(review): this loop only makes progress while heap->next->priv
        * is non-NULL - confirm that every heap node carries its program.
        */
       while (heap->next) {
          struct nv50_program *evict = heap->next->priv;
          if (evict)
             nouveau_heap_free(&evict->mem);
       }
       debug_printf("WARNING: out of code space, evicting all shaders.\n");

       /* The first attempt failed, so prog->mem must not be used yet: retry
        * now that the heap has been emptied and give up if that fails too.
        */
       ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
       if (ret) {
          NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
          return FALSE;
       }
    }
    prog->code_base = prog->mem->start;

    /* a positive return value is reported through state.new_tls_space */
    ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
    if (ret < 0)
       return FALSE;
    if (ret > 0)
       nv50->state.new_tls_space = TRUE;

    /* relocate against the address the code was just given */
    if (prog->fixups)
       nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);

    /* each shader type has its own region of the code buffer, selected by
     * (type << NV50_CODE_BO_SIZE_LOG2)
     */
    nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
                        (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
                        NOUVEAU_BO_VRAM, prog->code_size, prog->code);

    BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
    PUSH_DATA (nv50->base.pushbuf, 0);

    return TRUE;
 }

 void
 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 {
    const struct pipe_shader_state pipe = p->pipe;
    const ubyte type = p->type;

    if (p->mem)
       nouveau_heap_free(&p->mem);

    if (p->code)
       FREE(p->code);

    if (p->fixups)
       FREE(p->fixups);

    if (p->so)
       FREE(p->so);

    memset(p, 0, sizeof(*p));

    p->pipe = pipe;
    p->type = type;
 }
	/*
	* Copyright 2010 Christoph Bumiller
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
	* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
	* OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*/

	#include "nv50_program.h"
	#include "nv50_context.h"

	#include "codegen/nv50_ir_driver.h"

	/* Count the set bits in the low nibble of val; all higher bits are ignored. */
	static INLINE unsigned
	bitcount4(const uint32_t val)
	{
	   const uint32_t nibble = val & 0xf;

	   /* Add up the four bits one by one. */
	   return (nibble & 1) + ((nibble >> 1) & 1) +
	          ((nibble >> 2) & 1) + ((nibble >> 3) & 1);
	}

	static int
	nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
	{
	struct nv50_program prog = (struct nv50_program )info->driverPriv;
	unsigned i, n, c;

	n = 0;
	for (i = 0; i < info->numInputs; ++i) {
	prog->in[i].id = i;
	prog->in[i].sn = info->in[i].sn;
	prog->in[i].si = info->in[i].si;
	prog->in[i].hw = n;
	prog->in[i].mask = info->in[i].mask;

	prog->vp.attrs[(4 * i) / 32] \|= info->in[i].mask << ((4 * i) % 32);

	for (c = 0; c < 4; ++c)
	if (info->in[i].mask & (1 << c))
	info->in[i].slot[c] = n++;
	}
	prog->in_nr = info->numInputs;

	for (i = 0; i < info->numSysVals; ++i) {
	switch (info->sv[i].sn) {
	case TGSI_SEMANTIC_INSTANCEID:
	prog->vp.attrs[2] \|= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
	continue;
	case TGSI_SEMANTIC_VERTEXID:
	prog->vp.attrs[2] \|= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
	prog->vp.attrs[2] \|= NV50_3D_VP_GP_BUILTIN_ATTR_EN_UNK12;
	continue;
	default:
	break;
	}
	}

	/*
	* Corner case: VP has no inputs, but we will still need to submit data to
	* draw it. HW will shout at us and won't draw anything if we don't enable
	* any input, so let's just pretend it's the first one.
	*/
	if (prog->vp.attrs[0] == 0 &&
	prog->vp.attrs[1] == 0 &&
	prog->vp.attrs[2] == 0)
	prog->vp.attrs[0] \|= 0xf;

	/* VertexID before InstanceID */
	if (info->io.vertexId < info->numSysVals)
	info->sv[info->io.vertexId].slot[0] = n++;
	if (info->io.instanceId < info->numSysVals)
	info->sv[info->io.instanceId].slot[0] = n++;

	n = 0;
	for (i = 0; i < info->numOutputs; ++i) {
	switch (info->out[i].sn) {
	case TGSI_SEMANTIC_PSIZE:
	prog->vp.psiz = i;
	break;
	case TGSI_SEMANTIC_CLIPDIST:
	prog->vp.clpd[info->out[i].si] = n;
	break;
	case TGSI_SEMANTIC_EDGEFLAG:
	prog->vp.edgeflag = i;
	break;
	case TGSI_SEMANTIC_BCOLOR:
	prog->vp.bfc[info->out[i].si] = i;
	break;
	default:
	break;
	}
	prog->out[i].id = i;
	prog->out[i].sn = info->out[i].sn;
	prog->out[i].si = info->out[i].si;
	prog->out[i].hw = n;
	prog->out[i].mask = info->out[i].mask;

	for (c = 0; c < 4; ++c)
	if (info->out[i].mask & (1 << c))
	info->out[i].slot[c] = n++;
	}
	prog->out_nr = info->numOutputs;
	prog->max_out = n;

	if (prog->vp.psiz < info->numOutputs)
	prog->vp.psiz = prog->out[prog->vp.psiz].hw;

	return 0;
	}

	static int
	nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
	{
	struct nv50_program prog = (struct nv50_program )info->driverPriv;
	unsigned i, n, m, c;
	unsigned nvary;
	unsigned nflat;
	unsigned nintp = 0;

	/* count recorded non-flat inputs */
	for (m = 0, i = 0; i < info->numInputs; ++i) {
	switch (info->in[i].sn) {
	case TGSI_SEMANTIC_POSITION:
	case TGSI_SEMANTIC_FACE:
	continue;
	default:
	m += info->in[i].flat ? 0 : 1;
	break;
	}
	}
	/* careful: id may be != i in info->in[prog->in[i].id] */

	/* Fill prog->in[] so that non-flat inputs are first and
	* kick out special inputs that don't use the RESULT_MAP.
	*/
	for (n = 0, i = 0; i < info->numInputs; ++i) {
	if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
	prog->fp.interp \|= info->in[i].mask << 24;
	for (c = 0; c < 4; ++c)
	if (info->in[i].mask & (1 << c))
	info->in[i].slot[c] = nintp++;
	} else
	if (info->in[i].sn == TGSI_SEMANTIC_FACE) {
	info->in[i].slot[0] = 255;
	} else {
	unsigned j = info->in[i].flat ? m++ : n++;

	if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
	prog->vp.bfc[info->in[i].si] = j;

	prog->in[j].id = i;
	prog->in[j].mask = info->in[i].mask;
	prog->in[j].sn = info->in[i].sn;
	prog->in[j].si = info->in[i].si;
	prog->in[j].linear = info->in[i].linear;

	prog->in_nr++;
	}
	}
	if (!(prog->fp.interp & (8 << 24))) {
	++nintp;
	prog->fp.interp \|= 8 << 24;
	}

	for (i = 0; i < prog->in_nr; ++i) {
	int j = prog->in[i].id;

	prog->in[i].hw = nintp;
	for (c = 0; c < 4; ++c)
	if (prog->in[i].mask & (1 << c))
	info->in[j].slot[c] = nintp++;
	}
	/* (n == m) if m never increased, i.e. no flat inputs */
	nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
	nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
	nvary = nintp - nflat;

	prog->fp.interp \|= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
	prog->fp.interp \|= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;

	/* put front/back colors right after HPOS */
	prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
	for (i = 0; i < 2; ++i)
	if (prog->vp.bfc[i] < 0xff)
	prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;

	/* FP outputs */

	if (info->prop.fp.numColourResults > 1)
	prog->fp.flags[0] \|= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;

	for (i = 0; i < info->numOutputs; ++i) {
	prog->out[i].id = i;
	prog->out[i].sn = info->out[i].sn;
	prog->out[i].si = info->out[i].si;
	prog->out[i].mask = info->out[i].mask;

	if (i == info->io.fragDepth \|\| i == info->io.sampleMask)
	continue;
	prog->out[i].hw = info->out[i].si * 4;

	for (c = 0; c < 4; ++c)
	info->out[i].slot[c] = prog->out[i].hw + c;

	prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
	}

	if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
	info->out[info->io.sampleMask].slot[0] = prog->max_out++;

	if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
	info->out[info->io.fragDepth].slot[2] = prog->max_out++;

	if (!prog->max_out)
	prog->max_out = 4;

	return 0;
	}

	/*
	 * Slot assignment entry point: dispatch on the shader type in info->type.
	 * Vertex and geometry programs share nv50_vertprog_assign_slots(), fragment
	 * programs use nv50_fragprog_assign_slots(); any other type yields -1.
	 */
	static int
	nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
	{
	   if (info->type == PIPE_SHADER_FRAGMENT)
	      return nv50_fragprog_assign_slots(info);

	   if (info->type == PIPE_SHADER_VERTEX ||
	       info->type == PIPE_SHADER_GEOMETRY)
	      return nv50_vertprog_assign_slots(info);

	   return -1;
	}

	/*
	 * Allocate and fill the stream output state for a program.
	 *
	 * info supplies the hardware slot of every output component, pso describes
	 * which output components go to which buffer and offset. Returns the newly
	 * allocated state, or NULL if the allocation failed.
	 */
	static struct nv50_stream_output_state *
	nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
	                                  const struct pipe_stream_output_info *pso)
	{
	   struct nv50_stream_output_state *so;
	   unsigned b, i, c;
	   unsigned base[4]; /* index of each buffer's first entry in so->map */

	   so = MALLOC_STRUCT(nv50_stream_output_state);
	   if (!so)
	      return NULL;
	   /* every map byte starts out as 0xff and is overwritten below if used */
	   memset(so->map, 0xff, sizeof(so->map));

	   for (b = 0; b < 4; ++b)
	      so->num_attribs[b] = 0;
	   /* per buffer: the largest (dst_offset + num_components) of any output */
	   for (i = 0; i < pso->num_outputs; ++i) {
	      unsigned end = pso->output[i].dst_offset + pso->output[i].num_components;
	      b = pso->output[i].output_buffer;
	      assert(b < 4);
	      so->num_attribs[b] = MAX2(so->num_attribs[b], end);
	   }

	   /* start out interleaved; switched to separate mode below as soon as any
	    * buffer other than 0 is in use
	    */
	   so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;

	   so->stride[0] = pso->stride[0] * 4;
	   base[0] = 0;
	   for (b = 1; b < 4; ++b) {
	      assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
	      so->stride[b] = so->num_attribs[b] * 4;
	      if (so->num_attribs[b])
	         so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
	      /* each buffer's map region starts on a multiple of 4 */
	      base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
	   }
	   if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
	      assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
	      so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
	   }

	   so->map_size = base[3] + so->num_attribs[3];

	   /* record the hardware slot of every streamed component */
	   for (i = 0; i < pso->num_outputs; ++i) {
	      const unsigned s = pso->output[i].start_component;
	      const unsigned p = pso->output[i].dst_offset;
	      const unsigned r = pso->output[i].register_index;
	      b = pso->output[i].output_buffer;

	      for (c = 0; c < pso->output[i].num_components; ++c)
	         so->map[base[b] + p + c] = info->out[r].slot[s + c];
	   }

	   return so;
	}

	boolean
	nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
	{
	struct nv50_ir_prog_info *info;
	int ret;
	const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;

	info = CALLOC_STRUCT(nv50_ir_prog_info);
	if (!info)
	return FALSE;

	info->type = prog->type;
	info->target = chipset;
	info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
	info->bin.source = (void *)prog->pipe.tokens;

	info->io.ucpBinding = 15;
	info->io.ucpBase = 0;
	info->io.genUserClip = prog->vp.clpd_nr;

	info->assignSlots = nv50_program_assign_varying_slots;

	prog->vp.bfc[0] = 0xff;
	prog->vp.bfc[1] = 0xff;
	prog->vp.edgeflag = 0xff;
	prog->vp.clpd[0] = map_undef;
	prog->vp.clpd[1] = map_undef;
	prog->vp.psiz = map_undef;
	prog->gp.primid = 0x80;

	info->driverPriv = prog;

	#ifdef DEBUG
	info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
	info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
	#else
	info->optLevel = 3;
	#endif

	ret = nv50_ir_generate_code(info);
	if (ret) {
	NOUVEAU_ERR("shader translation failed: %i\n", ret);
	goto out;
	}
	if (info->bin.syms) /* we don't need them yet */
	FREE(info->bin.syms);

	prog->code = info->bin.code;
	prog->code_size = info->bin.codeSize;
	prog->fixups = info->bin.relocData;
	prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
	prog->tls_space = info->bin.tlsSpace;

	if (prog->type == PIPE_SHADER_FRAGMENT) {
	if (info->prop.fp.writesDepth) {
	prog->fp.flags[0] \|= NV50_3D_FP_CONTROL_EXPORTS_Z;
	prog->fp.flags[1] = 0x11;
	}
	if (info->prop.fp.usesDiscard)
	prog->fp.flags[0] \|= NV50_3D_FP_CONTROL_USES_KIL;
	}

	if (prog->pipe.stream_output.num_outputs)
	prog->so = nv50_program_create_strmout_state(info,
	&prog->pipe.stream_output);

	out:
	FREE(info);
	return !ret;
	}

	boolean
	nv50_program_upload_code(struct nv50_context nv50, struct nv50_program prog)
	{
	struct nouveau_heap *heap;
	int ret;
	uint32_t size = align(prog->code_size, 0x40);

	switch (prog->type) {
	case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break;
	case PIPE_SHADER_GEOMETRY: heap = nv50->screen->fp_code_heap; break;
	case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break;
	default:
	assert(!"invalid program type");
	return FALSE;
	}

	ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
	if (ret) {
	/* Out of space: evict everything to compactify the code segment, hoping
	* the working set is much smaller and drifts slowly. Improve me !
	*/
	while (heap->next) {
	struct nv50_program *evict = heap->next->priv;
	if (evict)
	nouveau_heap_free(&evict->mem);
	}
	debug_printf("WARNING: out of code space, evicting all shaders.\n");
	}
	prog->code_base = prog->mem->start;

	ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
	if (ret < 0)
	return FALSE;
	if (ret > 0)
	nv50->state.new_tls_space = TRUE;

	if (prog->fixups)
	nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);

	nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
	(prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
	NOUVEAU_BO_VRAM, prog->code_size, prog->code);

	BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
	PUSH_DATA (nv50->base.pushbuf, 0);

	return TRUE;
	}

	void
	nv50_program_destroy(struct nv50_context nv50, struct nv50_program p)
	{
	const struct pipe_shader_state pipe = p->pipe;
	const ubyte type = p->type;

	if (p->mem)
	nouveau_heap_free(&p->mem);

	if (p->code)
	FREE(p->code);

	if (p->fixups)
	FREE(p->fixups);

	if (p->so)
	FREE(p->so);

	memset(p, 0, sizeof(*p));

	p->pipe = pipe;
	p->type = type;
	}