| /* |
| Copyright (C) Intel Corp. 2006. All Rights Reserved. |
| Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to |
| develop this 3D driver. |
| |
| Permission is hereby granted, free of charge, to any person obtaining |
| a copy of this software and associated documentation files (the |
| "Software"), to deal in the Software without restriction, including |
| without limitation the rights to use, copy, modify, merge, publish, |
| distribute, sublicense, and/or sell copies of the Software, and to |
| permit persons to whom the Software is furnished to do so, subject to |
| the following conditions: |
| |
| The above copyright notice and this permission notice (including the |
| next paragraph) shall be included in all copies or substantial |
| portions of the Software. |
| |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE |
| LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION |
| OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION |
| WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| |
| **********************************************************************/ |
| /* |
| * Authors: |
| * Keith Whitwell <keith@tungstengraphics.com> |
| */ |
| |
| |
| #include "main/macros.h" |
| #include "brw_context.h" |
| #include "brw_wm.h" |
| |
| static bool |
| can_do_pln(struct intel_context *intel, const struct brw_reg *deltas) |
| { |
| struct brw_context *brw = brw_context(&intel->ctx); |
| |
| if (!brw->has_pln) |
| return false; |
| |
| if (deltas[1].nr != deltas[0].nr + 1) |
| return false; |
| |
| if (intel->gen < 6 && ((deltas[0].nr & 1) != 0)) |
| return false; |
| |
| return true; |
| } |
| |
| /* Return the SrcReg index of the channels that can be immediate float operands |
| * instead of usage of PROGRAM_CONSTANT values through push/pull. |
| */ |
| bool |
| brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg) |
| { |
| int opcode_array[] = { |
| [OPCODE_ADD] = 2, |
| [OPCODE_CMP] = 3, |
| [OPCODE_DP3] = 2, |
| [OPCODE_DP4] = 2, |
| [OPCODE_DPH] = 2, |
| [OPCODE_MAX] = 2, |
| [OPCODE_MIN] = 2, |
| [OPCODE_MOV] = 1, |
| [OPCODE_MUL] = 2, |
| [OPCODE_SEQ] = 2, |
| [OPCODE_SGE] = 2, |
| [OPCODE_SGT] = 2, |
| [OPCODE_SLE] = 2, |
| [OPCODE_SLT] = 2, |
| [OPCODE_SNE] = 2, |
| [OPCODE_SWZ] = 1, |
| [OPCODE_XPD] = 2, |
| }; |
| |
| /* These opcodes get broken down in a way that allow two |
| * args to be immediates. |
| */ |
| if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) { |
| if (arg == 1 || arg == 2) |
| return true; |
| } |
| |
| if (opcode > ARRAY_SIZE(opcode_array)) |
| return false; |
| |
| return arg == opcode_array[opcode] - 1; |
| } |
| |
| /** |
| * Computes the screen-space x,y position of the pixels. |
| * |
| * This will be used by emit_delta_xy() or emit_wpos_xy() for |
| * interpolation of attributes.. |
| * |
| * Payload R0: |
| * |
| * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles, |
| * corresponding to each of the 16 execution channels. |
| * R0.1..8 -- ? |
| * R1.0 -- triangle vertex 0.X |
| * R1.1 -- triangle vertex 0.Y |
| * R1.2 -- tile 0 x,y coords (2 packed uwords) |
| * R1.3 -- tile 1 x,y coords (2 packed uwords) |
| * R1.4 -- tile 2 x,y coords (2 packed uwords) |
| * R1.5 -- tile 3 x,y coords (2 packed uwords) |
| * R1.6 -- ? |
| * R1.7 -- ? |
| * R1.8 -- ? |
| */ |
| void emit_pixel_xy(struct brw_wm_compile *c, |
| const struct brw_reg *dst, |
| GLuint mask) |
| { |
| struct brw_compile *p = &c->func; |
| struct brw_reg r1 = brw_vec1_grf(1, 0); |
| struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW); |
| struct brw_reg dst0_uw, dst1_uw; |
| |
| brw_push_insn_state(p); |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); |
| |
| if (c->dispatch_width == 16) { |
| dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)); |
| dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)); |
| } else { |
| dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW)); |
| dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW)); |
| } |
| |
| /* Calculate pixel centers by adding 1 or 0 to each of the |
| * micro-tile coordinates passed in r1. |
| */ |
| if (mask & WRITEMASK_X) { |
| brw_ADD(p, |
| dst0_uw, |
| stride(suboffset(r1_uw, 4), 2, 4, 0), |
| brw_imm_v(0x10101010)); |
| } |
| |
| if (mask & WRITEMASK_Y) { |
| brw_ADD(p, |
| dst1_uw, |
| stride(suboffset(r1_uw,5), 2, 4, 0), |
| brw_imm_v(0x11001100)); |
| } |
| brw_pop_insn_state(p); |
| } |
| |
| /** |
| * Computes the screen-space x,y distance of the pixels from the start |
| * vertex. |
| * |
| * This will be used in linterp or pinterp with the start vertex value |
| * and the Cx, Cy, and C0 coefficients passed in from the setup engine |
| * to produce interpolated attribute values. |
| */ |
| void emit_delta_xy(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0) |
| { |
| struct intel_context *intel = &p->brw->intel; |
| struct brw_reg r1 = brw_vec1_grf(1, 0); |
| |
| if (mask == 0) |
| return; |
| |
| assert(mask == WRITEMASK_XY); |
| |
| if (intel->gen >= 6) { |
| /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1. |
| Just add them with 0.0 for dst reg.. */ |
| r1 = brw_imm_v(0x00000000); |
| brw_ADD(p, |
| dst[0], |
| retype(arg0[0], BRW_REGISTER_TYPE_UW), |
| r1); |
| brw_ADD(p, |
| dst[1], |
| retype(arg0[1], BRW_REGISTER_TYPE_UW), |
| r1); |
| return; |
| } |
| |
| /* Calc delta X,Y by subtracting origin in r1 from the pixel |
| * centers produced by emit_pixel_xy(). |
| */ |
| brw_ADD(p, |
| dst[0], |
| retype(arg0[0], BRW_REGISTER_TYPE_UW), |
| negate(r1)); |
| brw_ADD(p, |
| dst[1], |
| retype(arg0[1], BRW_REGISTER_TYPE_UW), |
| negate(suboffset(r1,1))); |
| } |
| |
| /** |
| * Computes the pixel offset from the window origin for gl_FragCoord(). |
| */ |
| void emit_wpos_xy(struct brw_wm_compile *c, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0) |
| { |
| struct brw_compile *p = &c->func; |
| struct intel_context *intel = &p->brw->intel; |
| struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W); |
| struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W); |
| |
| if (mask & WRITEMASK_X) { |
| if (intel->gen >= 6) { |
| struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F); |
| brw_MOV(p, delta_x_f, delta_x); |
| delta_x = delta_x_f; |
| } |
| |
| if (c->fp->program.PixelCenterInteger) { |
| /* X' = X */ |
| brw_MOV(p, dst[0], delta_x); |
| } else { |
| /* X' = X + 0.5 */ |
| brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5)); |
| } |
| } |
| |
| if (mask & WRITEMASK_Y) { |
| if (intel->gen >= 6) { |
| struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F); |
| brw_MOV(p, delta_y_f, delta_y); |
| delta_y = delta_y_f; |
| } |
| |
| if (c->fp->program.OriginUpperLeft) { |
| if (c->fp->program.PixelCenterInteger) { |
| /* Y' = Y */ |
| brw_MOV(p, dst[1], delta_y); |
| } else { |
| brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5)); |
| } |
| } else { |
| float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5; |
| |
| /* Y' = (height - 1) - Y + center */ |
| brw_ADD(p, dst[1], negate(delta_y), |
| brw_imm_f(c->key.drawable_height - 1 + center_offset)); |
| } |
| } |
| } |
| |
| |
| void emit_pixel_w(struct brw_wm_compile *c, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *deltas) |
| { |
| struct brw_compile *p = &c->func; |
| struct intel_context *intel = &p->brw->intel; |
| struct brw_reg src; |
| struct brw_reg temp_dst; |
| |
| if (intel->gen >= 6) |
| temp_dst = dst[3]; |
| else |
| temp_dst = brw_message_reg(2); |
| |
| assert(intel->gen < 6); |
| |
| /* Don't need this if all you are doing is interpolating color, for |
| * instance. |
| */ |
| if (mask & WRITEMASK_W) { |
| struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4); |
| |
| /* Calc 1/w - just linterp wpos[3] optimized by putting the |
| * result straight into a message reg. |
| */ |
| if (can_do_pln(intel, deltas)) { |
| brw_PLN(p, temp_dst, interp3, deltas[0]); |
| } else { |
| brw_LINE(p, brw_null_reg(), interp3, deltas[0]); |
| brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]); |
| } |
| |
| /* Calc w */ |
| if (intel->gen >= 6) |
| src = temp_dst; |
| else |
| src = brw_null_reg(); |
| |
| if (c->dispatch_width == 16) { |
| brw_math_16(p, dst[3], |
| BRW_MATH_FUNCTION_INV, |
| 2, src, |
| BRW_MATH_PRECISION_FULL); |
| } else { |
| brw_math(p, dst[3], |
| BRW_MATH_FUNCTION_INV, |
| 2, src, |
| BRW_MATH_DATA_VECTOR, |
| BRW_MATH_PRECISION_FULL); |
| } |
| } |
| } |
| |
| void emit_linterp(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *deltas) |
| { |
| struct intel_context *intel = &p->brw->intel; |
| struct brw_reg interp[4]; |
| GLuint nr = arg0[0].nr; |
| GLuint i; |
| |
| interp[0] = brw_vec1_grf(nr, 0); |
| interp[1] = brw_vec1_grf(nr, 4); |
| interp[2] = brw_vec1_grf(nr+1, 0); |
| interp[3] = brw_vec1_grf(nr+1, 4); |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| if (intel->gen >= 6) { |
| brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0)); |
| } else if (can_do_pln(intel, deltas)) { |
| brw_PLN(p, dst[i], interp[i], deltas[0]); |
| } else { |
| brw_LINE(p, brw_null_reg(), interp[i], deltas[0]); |
| brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]); |
| } |
| } |
| } |
| } |
| |
| |
| void emit_pinterp(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *deltas, |
| const struct brw_reg *w) |
| { |
| struct intel_context *intel = &p->brw->intel; |
| struct brw_reg interp[4]; |
| GLuint nr = arg0[0].nr; |
| GLuint i; |
| |
| if (intel->gen >= 6) { |
| emit_linterp(p, dst, mask, arg0, interp); |
| return; |
| } |
| |
| interp[0] = brw_vec1_grf(nr, 0); |
| interp[1] = brw_vec1_grf(nr, 4); |
| interp[2] = brw_vec1_grf(nr+1, 0); |
| interp[3] = brw_vec1_grf(nr+1, 4); |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| if (can_do_pln(intel, deltas)) { |
| brw_PLN(p, dst[i], interp[i], deltas[0]); |
| } else { |
| brw_LINE(p, brw_null_reg(), interp[i], deltas[0]); |
| brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]); |
| } |
| } |
| } |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| brw_MUL(p, dst[i], dst[i], w[3]); |
| } |
| } |
| } |
| |
| |
| void emit_cinterp(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0) |
| { |
| struct brw_reg interp[4]; |
| GLuint nr = arg0[0].nr; |
| GLuint i; |
| |
| interp[0] = brw_vec1_grf(nr, 0); |
| interp[1] = brw_vec1_grf(nr, 4); |
| interp[2] = brw_vec1_grf(nr+1, 0); |
| interp[3] = brw_vec1_grf(nr+1, 4); |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */ |
| } |
| } |
| } |
| |
| /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */ |
| void emit_frontfacing(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask) |
| { |
| struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); |
| GLuint i; |
| |
| if (!(mask & WRITEMASK_XYZW)) |
| return; |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| brw_MOV(p, dst[i], brw_imm_f(0.0)); |
| } |
| } |
| |
| /* bit 31 is "primitive is back face", so checking < (1 << 31) gives |
| * us front face |
| */ |
| brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31)); |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| brw_MOV(p, dst[i], brw_imm_f(1.0)); |
| } |
| } |
| brw_set_predicate_control_flag_value(p, 0xff); |
| } |
| |
| /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input |
| * looking like: |
| * |
| * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br |
| * |
| * and we're trying to produce: |
| * |
| * DDX DDY |
| * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) |
| * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) |
| * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) |
| * (ss0.br - ss0.bl) (ss0.tr - ss0.br) |
| * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) |
| * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) |
| * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) |
| * (ss1.br - ss1.bl) (ss1.tr - ss1.br) |
| * |
| * and add another set of two more subspans if in 16-pixel dispatch mode. |
| * |
| * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result |
| * for each pair, and vertstride = 2 jumps us 2 elements after processing a |
| * pair. But for DDY, it's harder, as we want to produce the pairs swizzled |
| * between each other. We could probably do it like ddx and swizzle the right |
| * order later, but bail for now and just produce |
| * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) |
| * |
| * The negate_value boolean is used to negate the d/dy computation for FBOs, |
| * since they place the origin at the upper left instead of the lower left. |
| */ |
| void emit_ddxy(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| bool is_ddx, |
| const struct brw_reg *arg0, |
| bool negate_value) |
| { |
| int i; |
| struct brw_reg src0, src1; |
| |
| if (mask & SATURATE) |
| brw_set_saturate(p, 1); |
| for (i = 0; i < 4; i++ ) { |
| if (mask & (1<<i)) { |
| if (is_ddx) { |
| src0 = brw_reg(arg0[i].file, arg0[i].nr, 1, |
| BRW_REGISTER_TYPE_F, |
| BRW_VERTICAL_STRIDE_2, |
| BRW_WIDTH_2, |
| BRW_HORIZONTAL_STRIDE_0, |
| BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); |
| src1 = brw_reg(arg0[i].file, arg0[i].nr, 0, |
| BRW_REGISTER_TYPE_F, |
| BRW_VERTICAL_STRIDE_2, |
| BRW_WIDTH_2, |
| BRW_HORIZONTAL_STRIDE_0, |
| BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); |
| } else { |
| src0 = brw_reg(arg0[i].file, arg0[i].nr, 0, |
| BRW_REGISTER_TYPE_F, |
| BRW_VERTICAL_STRIDE_4, |
| BRW_WIDTH_4, |
| BRW_HORIZONTAL_STRIDE_0, |
| BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); |
| src1 = brw_reg(arg0[i].file, arg0[i].nr, 2, |
| BRW_REGISTER_TYPE_F, |
| BRW_VERTICAL_STRIDE_4, |
| BRW_WIDTH_4, |
| BRW_HORIZONTAL_STRIDE_0, |
| BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); |
| } |
| if (negate_value) |
| brw_ADD(p, dst[i], src1, negate(src0)); |
| else |
| brw_ADD(p, dst[i], src0, negate(src1)); |
| } |
| } |
| if (mask & SATURATE) |
| brw_set_saturate(p, 0); |
| } |
| |
| void emit_alu1(struct brw_compile *p, |
| struct brw_instruction *(*func)(struct brw_compile *, |
| struct brw_reg, |
| struct brw_reg), |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0) |
| { |
| GLuint i; |
| |
| if (mask & SATURATE) |
| brw_set_saturate(p, 1); |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| func(p, dst[i], arg0[i]); |
| } |
| } |
| |
| if (mask & SATURATE) |
| brw_set_saturate(p, 0); |
| } |
| |
| |
| void emit_alu2(struct brw_compile *p, |
| struct brw_instruction *(*func)(struct brw_compile *, |
| struct brw_reg, |
| struct brw_reg, |
| struct brw_reg), |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1) |
| { |
| GLuint i; |
| |
| if (mask & SATURATE) |
| brw_set_saturate(p, 1); |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| func(p, dst[i], arg0[i], arg1[i]); |
| } |
| } |
| |
| if (mask & SATURATE) |
| brw_set_saturate(p, 0); |
| } |
| |
| |
| void emit_mad(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1, |
| const struct brw_reg *arg2) |
| { |
| GLuint i; |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| brw_MUL(p, dst[i], arg0[i], arg1[i]); |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_ADD(p, dst[i], dst[i], arg2[i]); |
| brw_set_saturate(p, 0); |
| } |
| } |
| } |
| |
| void emit_lrp(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1, |
| const struct brw_reg *arg2) |
| { |
| GLuint i; |
| |
| /* Uses dst as a temporary: |
| */ |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| /* Can I use the LINE instruction for this? |
| */ |
| brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0)); |
| brw_MUL(p, brw_null_reg(), dst[i], arg2[i]); |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_MAC(p, dst[i], arg0[i], arg1[i]); |
| brw_set_saturate(p, 0); |
| } |
| } |
| } |
| |
| void emit_sop(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| GLuint cond, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1) |
| { |
| GLuint i; |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| brw_push_insn_state(p); |
| brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]); |
| brw_set_predicate_control(p, BRW_PREDICATE_NONE); |
| brw_MOV(p, dst[i], brw_imm_f(0)); |
| brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); |
| brw_MOV(p, dst[i], brw_imm_f(1.0)); |
| brw_pop_insn_state(p); |
| } |
| } |
| } |
| |
| static void emit_slt( struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1 ) |
| { |
| emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1); |
| } |
| |
| static void emit_sle( struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1 ) |
| { |
| emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1); |
| } |
| |
| static void emit_sgt( struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1 ) |
| { |
| emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1); |
| } |
| |
| static void emit_sge( struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1 ) |
| { |
| emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1); |
| } |
| |
| static void emit_seq( struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1 ) |
| { |
| emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1); |
| } |
| |
| static void emit_sne( struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1 ) |
| { |
| emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1); |
| } |
| |
| void emit_cmp(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1, |
| const struct brw_reg *arg2) |
| { |
| GLuint i; |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0)); |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_SEL(p, dst[i], arg1[i], arg2[i]); |
| brw_set_saturate(p, 0); |
| brw_set_predicate_control_flag_value(p, 0xff); |
| } |
| } |
| } |
| |
| void emit_sign(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0) |
| { |
| GLuint i; |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| brw_MOV(p, dst[i], brw_imm_f(0.0)); |
| |
| brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0)); |
| brw_MOV(p, dst[i], brw_imm_f(-1.0)); |
| brw_set_predicate_control(p, BRW_PREDICATE_NONE); |
| |
| brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0)); |
| brw_MOV(p, dst[i], brw_imm_f(1.0)); |
| brw_set_predicate_control(p, BRW_PREDICATE_NONE); |
| } |
| } |
| } |
| |
| void emit_max(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1) |
| { |
| GLuint i; |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]); |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_SEL(p, dst[i], arg0[i], arg1[i]); |
| brw_set_saturate(p, 0); |
| brw_set_predicate_control_flag_value(p, 0xff); |
| } |
| } |
| } |
| |
| void emit_min(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1) |
| { |
| GLuint i; |
| |
| for (i = 0; i < 4; i++) { |
| if (mask & (1<<i)) { |
| brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]); |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_SEL(p, dst[i], arg0[i], arg1[i]); |
| brw_set_saturate(p, 0); |
| brw_set_predicate_control_flag_value(p, 0xff); |
| } |
| } |
| } |
| |
| |
| void emit_dp2(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1) |
| { |
| int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; |
| |
| if (!(mask & WRITEMASK_XYZW)) |
| return; /* Do not emit dead code */ |
| |
| assert(is_power_of_two(mask & WRITEMASK_XYZW)); |
| |
| brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]); |
| brw_set_saturate(p, 0); |
| } |
| |
| |
| void emit_dp3(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1) |
| { |
| int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; |
| |
| if (!(mask & WRITEMASK_XYZW)) |
| return; /* Do not emit dead code */ |
| |
| assert(is_power_of_two(mask & WRITEMASK_XYZW)); |
| |
| brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); |
| brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]); |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]); |
| brw_set_saturate(p, 0); |
| } |
| |
| |
| void emit_dp4(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1) |
| { |
| int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; |
| |
| if (!(mask & WRITEMASK_XYZW)) |
| return; /* Do not emit dead code */ |
| |
| assert(is_power_of_two(mask & WRITEMASK_XYZW)); |
| |
| brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); |
| brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]); |
| brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]); |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]); |
| brw_set_saturate(p, 0); |
| } |
| |
| |
| void emit_dph(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1) |
| { |
| const int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; |
| |
| if (!(mask & WRITEMASK_XYZW)) |
| return; /* Do not emit dead code */ |
| |
| assert(is_power_of_two(mask & WRITEMASK_XYZW)); |
| |
| brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]); |
| brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]); |
| brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]); |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]); |
| brw_set_saturate(p, 0); |
| } |
| |
| |
| void emit_xpd(struct brw_compile *p, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1) |
| { |
| GLuint i; |
| |
| assert((mask & WRITEMASK_W) != WRITEMASK_W); |
| |
| for (i = 0 ; i < 3; i++) { |
| if (mask & (1<<i)) { |
| GLuint i2 = (i+2)%3; |
| GLuint i1 = (i+1)%3; |
| |
| brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]); |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_MAC(p, dst[i], arg0[i1], arg1[i2]); |
| brw_set_saturate(p, 0); |
| } |
| } |
| } |
| |
| |
| void emit_math1(struct brw_wm_compile *c, |
| GLuint function, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0) |
| { |
| struct brw_compile *p = &c->func; |
| struct intel_context *intel = &p->brw->intel; |
| int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; |
| struct brw_reg src; |
| |
| if (!(mask & WRITEMASK_XYZW)) |
| return; /* Do not emit dead code */ |
| |
| assert(is_power_of_two(mask & WRITEMASK_XYZW)); |
| |
| if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 || |
| arg0[0].file != BRW_GENERAL_REGISTER_FILE) || |
| arg0[0].negate || arg0[0].abs)) { |
| /* Gen6 math requires that source and dst horizontal stride be 1, |
| * and that the argument be in the GRF. |
| * |
| * The hardware ignores source modifiers (negate and abs) on math |
| * instructions, so we also move to a temp to set those up. |
| */ |
| src = dst[dst_chan]; |
| brw_MOV(p, src, arg0[0]); |
| } else { |
| src = arg0[0]; |
| } |
| |
| /* Send two messages to perform all 16 operations: |
| */ |
| brw_push_insn_state(p); |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); |
| brw_math(p, |
| dst[dst_chan], |
| function, |
| 2, |
| src, |
| BRW_MATH_DATA_VECTOR, |
| BRW_MATH_PRECISION_FULL); |
| |
| if (c->dispatch_width == 16) { |
| brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); |
| brw_math(p, |
| offset(dst[dst_chan],1), |
| function, |
| 3, |
| sechalf(src), |
| BRW_MATH_DATA_VECTOR, |
| BRW_MATH_PRECISION_FULL); |
| } |
| brw_pop_insn_state(p); |
| } |
| |
| |
| void emit_math2(struct brw_wm_compile *c, |
| GLuint function, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0, |
| const struct brw_reg *arg1) |
| { |
| struct brw_compile *p = &c->func; |
| struct intel_context *intel = &p->brw->intel; |
| int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1; |
| |
| if (!(mask & WRITEMASK_XYZW)) |
| return; /* Do not emit dead code */ |
| |
| assert(is_power_of_two(mask & WRITEMASK_XYZW)); |
| |
| brw_push_insn_state(p); |
| |
| /* math can only operate on up to a vec8 at a time, so in |
| * dispatch_width==16 we have to do the second half manually. |
| */ |
| if (intel->gen >= 6) { |
| struct brw_reg src0 = arg0[0]; |
| struct brw_reg src1 = arg1[0]; |
| struct brw_reg temp_dst = dst[dst_chan]; |
| |
| if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) { |
| brw_MOV(p, temp_dst, src0); |
| src0 = temp_dst; |
| } |
| |
| if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) { |
| /* This is a heinous hack to get a temporary register for use |
| * in case both arg0 and arg1 are constants. Why you're |
| * doing exponentiation on constant values in the shader, we |
| * don't know. |
| * |
| * max_wm_grf is almost surely less than the maximum GRF, and |
| * gen6 doesn't care about the number of GRFs used in a |
| * shader like pre-gen6 did. |
| */ |
| struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0); |
| brw_MOV(p, temp, src1); |
| src1 = temp; |
| } |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); |
| brw_math2(p, |
| temp_dst, |
| function, |
| src0, |
| src1); |
| if (c->dispatch_width == 16) { |
| brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); |
| brw_math2(p, |
| sechalf(temp_dst), |
| function, |
| sechalf(src0), |
| sechalf(src1)); |
| } |
| } else { |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); |
| brw_MOV(p, brw_message_reg(3), arg1[0]); |
| if (c->dispatch_width == 16) { |
| brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); |
| brw_MOV(p, brw_message_reg(5), sechalf(arg1[0])); |
| } |
| |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); |
| brw_math(p, |
| dst[dst_chan], |
| function, |
| 2, |
| arg0[0], |
| BRW_MATH_DATA_VECTOR, |
| BRW_MATH_PRECISION_FULL); |
| |
| /* Send two messages to perform all 16 operations: |
| */ |
| if (c->dispatch_width == 16) { |
| brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); |
| brw_math(p, |
| offset(dst[dst_chan],1), |
| function, |
| 4, |
| sechalf(arg0[0]), |
| BRW_MATH_DATA_VECTOR, |
| BRW_MATH_PRECISION_FULL); |
| } |
| } |
| brw_pop_insn_state(p); |
| } |
| |
| |
| void emit_tex(struct brw_wm_compile *c, |
| struct brw_reg *dst, |
| GLuint dst_flags, |
| struct brw_reg *arg, |
| struct brw_reg depth_payload, |
| GLuint tex_idx, |
| GLuint sampler, |
| bool shadow) |
| { |
| struct brw_compile *p = &c->func; |
| struct intel_context *intel = &p->brw->intel; |
| struct brw_reg dst_retyped; |
| GLuint cur_mrf = 2, response_length; |
| GLuint i, nr_texcoords; |
| GLuint emit; |
| GLuint msg_type; |
| GLuint mrf_per_channel; |
| GLuint simd_mode; |
| |
| if (c->dispatch_width == 16) { |
| mrf_per_channel = 2; |
| response_length = 8; |
| dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW); |
| simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; |
| } else { |
| mrf_per_channel = 1; |
| response_length = 4; |
| dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW); |
| simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; |
| } |
| |
| /* How many input regs are there? |
| */ |
| switch (tex_idx) { |
| case TEXTURE_1D_INDEX: |
| emit = WRITEMASK_X; |
| nr_texcoords = 1; |
| break; |
| case TEXTURE_2D_INDEX: |
| case TEXTURE_1D_ARRAY_INDEX: |
| case TEXTURE_RECT_INDEX: |
| case TEXTURE_EXTERNAL_INDEX: |
| emit = WRITEMASK_XY; |
| nr_texcoords = 2; |
| break; |
| case TEXTURE_3D_INDEX: |
| case TEXTURE_2D_ARRAY_INDEX: |
| case TEXTURE_CUBE_INDEX: |
| emit = WRITEMASK_XYZ; |
| nr_texcoords = 3; |
| break; |
| default: |
| /* unexpected target */ |
| abort(); |
| } |
| |
| /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */ |
| if (intel->gen < 5 && c->dispatch_width == 8) |
| nr_texcoords = 3; |
| |
| if (shadow) { |
| if (intel->gen < 7) { |
| /* For shadow comparisons, we have to supply u,v,r. */ |
| nr_texcoords = 3; |
| } else { |
| /* On Ivybridge, the shadow comparitor comes first. Just load it. */ |
| brw_MOV(p, brw_message_reg(cur_mrf), arg[2]); |
| cur_mrf += mrf_per_channel; |
| } |
| } |
| |
| /* Emit the texcoords. */ |
| for (i = 0; i < nr_texcoords; i++) { |
| if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) |
| brw_set_saturate(p, true); |
| |
| if (emit & (1<<i)) |
| brw_MOV(p, brw_message_reg(cur_mrf), arg[i]); |
| else |
| brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0)); |
| cur_mrf += mrf_per_channel; |
| |
| brw_set_saturate(p, false); |
| } |
| |
| /* Fill in the shadow comparison reference value. */ |
| if (shadow && intel->gen < 7) { |
| if (intel->gen >= 5) { |
| /* Fill in the cube map array index value. */ |
| brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0)); |
| cur_mrf += mrf_per_channel; |
| } else if (c->dispatch_width == 8) { |
| /* Fill in the LOD bias value. */ |
| brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0)); |
| cur_mrf += mrf_per_channel; |
| } |
| brw_MOV(p, brw_message_reg(cur_mrf), arg[2]); |
| cur_mrf += mrf_per_channel; |
| } |
| |
| if (intel->gen >= 5) { |
| if (shadow) |
| msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; |
| else |
| msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; |
| } else { |
| /* Note that G45 and older determines shadow compare and dispatch width |
| * from message length for most messages. |
| */ |
| if (c->dispatch_width == 16 && shadow) |
| msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE; |
| else |
| msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE; |
| } |
| |
| brw_SAMPLE(p, |
| dst_retyped, |
| 1, |
| retype(depth_payload, BRW_REGISTER_TYPE_UW), |
| SURF_INDEX_TEXTURE(sampler), |
| sampler, |
| dst_flags & WRITEMASK_XYZW, |
| msg_type, |
| response_length, |
| cur_mrf - 1, |
| 1, |
| simd_mode, |
| BRW_SAMPLER_RETURN_FORMAT_FLOAT32); |
| } |
| |
| |
| void emit_txb(struct brw_wm_compile *c, |
| struct brw_reg *dst, |
| GLuint dst_flags, |
| struct brw_reg *arg, |
| struct brw_reg depth_payload, |
| GLuint tex_idx, |
| GLuint sampler) |
| { |
| struct brw_compile *p = &c->func; |
| struct intel_context *intel = &p->brw->intel; |
| GLuint msgLength; |
| GLuint msg_type; |
| GLuint mrf_per_channel; |
| GLuint response_length; |
| struct brw_reg dst_retyped; |
| |
| /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased |
| * samples, so we'll use the 16-wide instruction, leave the second halves |
| * undefined, and trust the execution mask to keep the undefined pixels |
| * from mattering. |
| */ |
| if (c->dispatch_width == 16 || intel->gen < 5) { |
| if (intel->gen >= 5) |
| msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; |
| else |
| msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; |
| mrf_per_channel = 2; |
| dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW); |
| response_length = 8; |
| } else { |
| msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; |
| mrf_per_channel = 1; |
| dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW); |
| response_length = 4; |
| } |
| |
| /* Shadow ignored for txb. */ |
| switch (tex_idx) { |
| case TEXTURE_1D_INDEX: |
| brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]); |
| brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0)); |
| brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0)); |
| break; |
| case TEXTURE_2D_INDEX: |
| case TEXTURE_RECT_INDEX: |
| case TEXTURE_EXTERNAL_INDEX: |
| brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]); |
| brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]); |
| brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0)); |
| break; |
| case TEXTURE_3D_INDEX: |
| case TEXTURE_CUBE_INDEX: |
| brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]); |
| brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]); |
| brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]); |
| break; |
| default: |
| /* unexpected target */ |
| abort(); |
| } |
| |
| brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]); |
| msgLength = 2 + 4 * mrf_per_channel - 1; |
| |
| brw_SAMPLE(p, |
| dst_retyped, |
| 1, |
| retype(depth_payload, BRW_REGISTER_TYPE_UW), |
| SURF_INDEX_TEXTURE(sampler), |
| sampler, |
| dst_flags & WRITEMASK_XYZW, |
| msg_type, |
| response_length, |
| msgLength, |
| 1, |
| BRW_SAMPLER_SIMD_MODE_SIMD16, |
| BRW_SAMPLER_RETURN_FORMAT_FLOAT32); |
| } |
| |
| |
| static void emit_lit(struct brw_wm_compile *c, |
| const struct brw_reg *dst, |
| GLuint mask, |
| const struct brw_reg *arg0) |
| { |
| struct brw_compile *p = &c->func; |
| |
| assert((mask & WRITEMASK_XW) == 0); |
| |
| if (mask & WRITEMASK_Y) { |
| brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); |
| brw_MOV(p, dst[1], arg0[0]); |
| brw_set_saturate(p, 0); |
| } |
| |
| if (mask & WRITEMASK_Z) { |
| emit_math2(c, BRW_MATH_FUNCTION_POW, |
| &dst[2], |
| WRITEMASK_X | (mask & SATURATE), |
| &arg0[1], |
| &arg0[3]); |
| } |
| |
| /* Ordinarily you'd use an iff statement to skip or shortcircuit |
| * some of the POW calculations above, but 16-wide iff statements |
| * seem to lock c1 hardware, so this is a nasty workaround: |
| */ |
| brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0)); |
| { |
| if (mask & WRITEMASK_Y) |
| brw_MOV(p, dst[1], brw_imm_f(0)); |
| |
| if (mask & WRITEMASK_Z) |
| brw_MOV(p, dst[2], brw_imm_f(0)); |
| } |
| brw_set_predicate_control(p, BRW_PREDICATE_NONE); |
| } |
| |
| |
| /* Kill pixel - set execution mask to zero for those pixels which |
| * fail. |
| */ |
| static void emit_kil( struct brw_wm_compile *c, |
| struct brw_reg *arg0) |
| { |
| struct brw_compile *p = &c->func; |
| struct intel_context *intel = &p->brw->intel; |
| struct brw_reg pixelmask; |
| GLuint i, j; |
| |
| if (intel->gen >= 6) |
| pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); |
| else |
| pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); |
| |
| for (i = 0; i < 4; i++) { |
| /* Check if we've already done the comparison for this reg |
| * -- common when someone does KIL TEMP.wwww. |
| */ |
| for (j = 0; j < i; j++) { |
| if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0) |
| break; |
| } |
| if (j != i) |
| continue; |
| |
| brw_push_insn_state(p); |
| brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0)); |
| brw_set_predicate_control_flag_value(p, 0xff); |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); |
| brw_AND(p, pixelmask, brw_flag_reg(), pixelmask); |
| brw_pop_insn_state(p); |
| } |
| } |
| |
| static void fire_fb_write( struct brw_wm_compile *c, |
| GLuint base_reg, |
| GLuint nr, |
| GLuint target, |
| GLuint eot ) |
| { |
| struct brw_compile *p = &c->func; |
| struct intel_context *intel = &p->brw->intel; |
| uint32_t msg_control; |
| |
| /* Pass through control information: |
| * |
| * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case. |
| */ |
| /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */ |
| if (intel->gen < 6) |
| { |
| brw_push_insn_state(p); |
| brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */ |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); |
| brw_MOV(p, |
| brw_message_reg(base_reg + 1), |
| brw_vec8_grf(1, 0)); |
| brw_pop_insn_state(p); |
| } |
| |
| if (c->dispatch_width == 16) |
| msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; |
| else |
| msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; |
| |
| /* Send framebuffer write message: */ |
| /* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */ |
| brw_fb_WRITE(p, |
| c->dispatch_width, |
| base_reg, |
| retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW), |
| msg_control, |
| target, |
| nr, |
| 0, |
| eot, |
| true); |
| } |
| |
| |
| static void emit_aa( struct brw_wm_compile *c, |
| struct brw_reg *arg1, |
| GLuint reg ) |
| { |
| struct brw_compile *p = &c->func; |
| GLuint comp = c->aa_dest_stencil_reg / 2; |
| GLuint off = c->aa_dest_stencil_reg % 2; |
| struct brw_reg aa = offset(arg1[comp], off); |
| |
| brw_push_insn_state(p); |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */ |
| brw_MOV(p, brw_message_reg(reg), aa); |
| brw_pop_insn_state(p); |
| } |
| |
| |
| /* Post-fragment-program processing. Send the results to the |
| * framebuffer. |
| * \param arg0 the fragment color |
| * \param arg1 the pass-through depth value |
| * \param arg2 the shader-computed depth value |
| */ |
| void emit_fb_write(struct brw_wm_compile *c, |
| struct brw_reg *arg0, |
| struct brw_reg *arg1, |
| struct brw_reg *arg2, |
| GLuint target, |
| GLuint eot) |
| { |
| struct brw_compile *p = &c->func; |
| struct brw_context *brw = p->brw; |
| struct intel_context *intel = &brw->intel; |
| GLuint nr = 2; |
| GLuint channel; |
| |
| /* Reserve a space for AA - may not be needed: |
| */ |
| if (c->aa_dest_stencil_reg) |
| nr += 1; |
| |
| /* I don't really understand how this achieves the color interleave |
| * (ie RGBARGBA) in the result: [Do the saturation here] |
| */ |
| brw_push_insn_state(p); |
| |
| if (c->key.clamp_fragment_color) |
| brw_set_saturate(p, 1); |
| |
| for (channel = 0; channel < 4; channel++) { |
| if (intel->gen >= 6) { |
| /* gen6 SIMD16 single source DP write looks like: |
| * m + 0: r0 |
| * m + 1: r1 |
| * m + 2: g0 |
| * m + 3: g1 |
| * m + 4: b0 |
| * m + 5: b1 |
| * m + 6: a0 |
| * m + 7: a1 |
| */ |
| if (c->dispatch_width == 16) { |
| brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]); |
| } else { |
| brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]); |
| } |
| } else if (c->dispatch_width == 16 && brw->has_compr4) { |
| /* pre-gen6 SIMD16 single source DP write looks like: |
| * m + 0: r0 |
| * m + 1: g0 |
| * m + 2: b0 |
| * m + 3: a0 |
| * m + 4: r1 |
| * m + 5: g1 |
| * m + 6: b1 |
| * m + 7: a1 |
| * |
| * By setting the high bit of the MRF register number, we indicate |
| * that we want COMPR4 mode - instead of doing the usual destination |
| * + 1 for the second half we get destination + 4. |
| */ |
| brw_MOV(p, |
| brw_message_reg(nr + channel + BRW_MRF_COMPR4), |
| arg0[channel]); |
| } else { |
| /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */ |
| /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */ |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); |
| brw_MOV(p, |
| brw_message_reg(nr + channel), |
| arg0[channel]); |
| |
| if (c->dispatch_width == 16) { |
| brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); |
| brw_MOV(p, |
| brw_message_reg(nr + channel + 4), |
| sechalf(arg0[channel])); |
| } |
| } |
| } |
| |
| brw_set_saturate(p, 0); |
| |
| /* skip over the regs populated above: |
| */ |
| if (c->dispatch_width == 16) |
| nr += 8; |
| else |
| nr += 4; |
| |
| brw_pop_insn_state(p); |
| |
| if (c->source_depth_to_render_target) |
| { |
| if (c->computes_depth) |
| brw_MOV(p, brw_message_reg(nr), arg2[2]); |
| else |
| brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */ |
| |
| nr += 2; |
| } |
| |
| if (c->dest_depth_reg) |
| { |
| GLuint comp = c->dest_depth_reg / 2; |
| GLuint off = c->dest_depth_reg % 2; |
| |
| if (off != 0) { |
| brw_push_insn_state(p); |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); |
| |
| brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1)); |
| /* 2nd half? */ |
| brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]); |
| brw_pop_insn_state(p); |
| } |
| else { |
| brw_MOV(p, brw_message_reg(nr), arg1[comp]); |
| } |
| nr += 2; |
| } |
| |
| if (intel->gen >= 6) { |
| /* Load the message header. There's no implied move from src0 |
| * to the base mrf on gen6. |
| */ |
| brw_push_insn_state(p); |
| brw_set_mask_control(p, BRW_MASK_DISABLE); |
| brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD), |
| retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); |
| brw_pop_insn_state(p); |
| |
| if (target != 0) { |
| brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, |
| 0, |
| 2), BRW_REGISTER_TYPE_UD), |
| brw_imm_ud(target)); |
| } |
| } |
| |
| if (!c->runtime_check_aads_emit) { |
| if (c->aa_dest_stencil_reg) |
| emit_aa(c, arg1, 2); |
| |
| fire_fb_write(c, 0, nr, target, eot); |
| } |
| else { |
| struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); |
| struct brw_reg ip = brw_ip_reg(); |
| int jmp; |
| |
| brw_set_compression_control(p, BRW_COMPRESSION_NONE); |
| brw_set_conditionalmod(p, BRW_CONDITIONAL_Z); |
| brw_AND(p, |
| v1_null_ud, |
| get_element_ud(brw_vec8_grf(1,0), 6), |
| brw_imm_ud(1<<26)); |
| |
| jmp = brw_JMPI(p, ip, ip, brw_imm_w(0)) - p->store; |
| { |
| emit_aa(c, arg1, 2); |
| fire_fb_write(c, 0, nr, target, eot); |
| /* note - thread killed in subroutine */ |
| } |
| brw_land_fwd_jump(p, jmp); |
| |
| /* ELSE: Shuffle up one register to fill in the hole left for AA: |
| */ |
| fire_fb_write(c, 1, nr-1, target, eot); |
| } |
| } |
| |
| /** |
| * Move a GPR to scratch memory. |
| */ |
| static void emit_spill( struct brw_wm_compile *c, |
| struct brw_reg reg, |
| GLuint slot ) |
| { |
| struct brw_compile *p = &c->func; |
| |
| /* |
| mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr } |
| */ |
| brw_MOV(p, brw_message_reg(2), reg); |
| |
| /* |
| mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask } |
| send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 } |
| */ |
| brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot); |
| } |
| |
| |
| /** |
| * Load a GPR from scratch memory. |
| */ |
| static void emit_unspill( struct brw_wm_compile *c, |
| struct brw_reg reg, |
| GLuint slot ) |
| { |
| struct brw_compile *p = &c->func; |
| |
| /* Slot 0 is the undef value. |
| */ |
| if (slot == 0) { |
| brw_MOV(p, reg, brw_imm_f(0)); |
| return; |
| } |
| |
| /* |
| mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask } |
| send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 } |
| */ |
| |
| brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot); |
| } |
| |
| |
| /** |
| * Retrieve up to 4 GEN4 register pairs for the given wm reg: |
| * Args with unspill_reg != 0 will be loaded from scratch memory. |
| */ |
| static void get_argument_regs( struct brw_wm_compile *c, |
| struct brw_wm_ref *arg[], |
| struct brw_reg *regs ) |
| { |
| GLuint i; |
| |
| for (i = 0; i < 4; i++) { |
| if (arg[i]) { |
| if (arg[i]->unspill_reg) |
| emit_unspill(c, |
| brw_vec8_grf(arg[i]->unspill_reg, 0), |
| arg[i]->value->spill_slot); |
| |
| regs[i] = arg[i]->hw_reg; |
| } |
| else { |
| regs[i] = brw_null_reg(); |
| } |
| } |
| } |
| |
| |
| /** |
| * For values that have a spill_slot!=0, write those regs to scratch memory. |
| */ |
| static void spill_values( struct brw_wm_compile *c, |
| struct brw_wm_value *values, |
| GLuint nr ) |
| { |
| GLuint i; |
| |
| for (i = 0; i < nr; i++) |
| if (values[i].spill_slot) |
| emit_spill(c, values[i].hw_reg, values[i].spill_slot); |
| } |
| |
| |
| /* Emit the fragment program instructions here. |
| */ |
| void brw_wm_emit( struct brw_wm_compile *c ) |
| { |
| struct brw_compile *p = &c->func; |
| struct intel_context *intel = &p->brw->intel; |
| GLuint insn; |
| |
| brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); |
| if (intel->gen >= 6) |
| brw_set_acc_write_control(p, 1); |
| |
| /* Check if any of the payload regs need to be spilled: |
| */ |
| spill_values(c, c->payload.depth, 4); |
| spill_values(c, c->creg, c->nr_creg); |
| spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX); |
| |
| |
| for (insn = 0; insn < c->nr_insns; insn++) { |
| |
| struct brw_wm_instruction *inst = &c->instruction[insn]; |
| struct brw_reg args[3][4], dst[4]; |
| GLuint i, dst_flags; |
| |
| /* Get argument regs: |
| */ |
| for (i = 0; i < 3; i++) |
| get_argument_regs(c, inst->src[i], args[i]); |
| |
| /* Get dest regs: |
| */ |
| for (i = 0; i < 4; i++) |
| if (inst->dst[i]) |
| dst[i] = inst->dst[i]->hw_reg; |
| else |
| dst[i] = brw_null_reg(); |
| |
| /* Flags |
| */ |
| dst_flags = inst->writemask; |
| if (inst->saturate) |
| dst_flags |= SATURATE; |
| |
| switch (inst->opcode) { |
| /* Generated instructions for calculating triangle interpolants: |
| */ |
| case WM_PIXELXY: |
| emit_pixel_xy(c, dst, dst_flags); |
| break; |
| |
| case WM_DELTAXY: |
| emit_delta_xy(p, dst, dst_flags, args[0]); |
| break; |
| |
| case WM_WPOSXY: |
| emit_wpos_xy(c, dst, dst_flags, args[0]); |
| break; |
| |
| case WM_PIXELW: |
| emit_pixel_w(c, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case WM_LINTERP: |
| emit_linterp(p, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case WM_PINTERP: |
| emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]); |
| break; |
| |
| case WM_CINTERP: |
| emit_cinterp(p, dst, dst_flags, args[0]); |
| break; |
| |
| case WM_FB_WRITE: |
| emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot); |
| break; |
| |
| case WM_FRONTFACING: |
| emit_frontfacing(p, dst, dst_flags); |
| break; |
| |
| /* Straightforward arithmetic: |
| */ |
| case OPCODE_ADD: |
| emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case OPCODE_FRC: |
| emit_alu1(p, brw_FRC, dst, dst_flags, args[0]); |
| break; |
| |
| case OPCODE_FLR: |
| emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]); |
| break; |
| |
| case OPCODE_DDX: |
| emit_ddxy(p, dst, dst_flags, true, args[0], false); |
| break; |
| |
| case OPCODE_DDY: |
| /* Make sure fp->program.UsesDFdy flag got set (otherwise there's no |
| * guarantee that c->key.render_to_fbo is set). |
| */ |
| assert(c->fp->program.UsesDFdy); |
| emit_ddxy(p, dst, dst_flags, false, args[0], c->key.render_to_fbo); |
| break; |
| |
| case OPCODE_DP2: |
| emit_dp2(p, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case OPCODE_DP3: |
| emit_dp3(p, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case OPCODE_DP4: |
| emit_dp4(p, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case OPCODE_DPH: |
| emit_dph(p, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case OPCODE_TRUNC: |
| for (i = 0; i < 4; i++) { |
| if (dst_flags & (1<<i)) { |
| brw_RNDZ(p, dst[i], args[0][i]); |
| } |
| } |
| break; |
| |
| case OPCODE_LRP: |
| emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]); |
| break; |
| |
| case OPCODE_MAD: |
| emit_mad(p, dst, dst_flags, args[0], args[1], args[2]); |
| break; |
| |
| case OPCODE_MOV: |
| case OPCODE_SWZ: |
| emit_alu1(p, brw_MOV, dst, dst_flags, args[0]); |
| break; |
| |
| case OPCODE_MUL: |
| emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case OPCODE_XPD: |
| emit_xpd(p, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| /* Higher math functions: |
| */ |
| case OPCODE_RCP: |
| emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]); |
| break; |
| |
| case OPCODE_RSQ: |
| emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]); |
| break; |
| |
| case OPCODE_SIN: |
| emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]); |
| break; |
| |
| case OPCODE_COS: |
| emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]); |
| break; |
| |
| case OPCODE_EX2: |
| emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]); |
| break; |
| |
| case OPCODE_LG2: |
| emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]); |
| break; |
| |
| case OPCODE_SCS: |
| /* There is an scs math function, but it would need some |
| * fixup for 16-element execution. |
| */ |
| if (dst_flags & WRITEMASK_X) |
| emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]); |
| if (dst_flags & WRITEMASK_Y) |
| emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]); |
| break; |
| |
| case OPCODE_POW: |
| emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| /* Comparisons: |
| */ |
| case OPCODE_CMP: |
| emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]); |
| break; |
| |
| case OPCODE_MAX: |
| emit_max(p, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case OPCODE_MIN: |
| emit_min(p, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case OPCODE_SLT: |
| emit_slt(p, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case OPCODE_SLE: |
| emit_sle(p, dst, dst_flags, args[0], args[1]); |
| break; |
| case OPCODE_SGT: |
| emit_sgt(p, dst, dst_flags, args[0], args[1]); |
| break; |
| case OPCODE_SGE: |
| emit_sge(p, dst, dst_flags, args[0], args[1]); |
| break; |
| case OPCODE_SEQ: |
| emit_seq(p, dst, dst_flags, args[0], args[1]); |
| break; |
| case OPCODE_SNE: |
| emit_sne(p, dst, dst_flags, args[0], args[1]); |
| break; |
| |
| case OPCODE_SSG: |
| emit_sign(p, dst, dst_flags, args[0]); |
| break; |
| |
| case OPCODE_LIT: |
| emit_lit(c, dst, dst_flags, args[0]); |
| break; |
| |
| /* Texturing operations: |
| */ |
| case OPCODE_TEX: |
| emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg, |
| inst->tex_idx, inst->tex_unit, |
| inst->tex_shadow); |
| break; |
| |
| case OPCODE_TXB: |
| emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg, |
| inst->tex_idx, inst->tex_unit); |
| break; |
| |
| case OPCODE_KIL: |
| emit_kil(c, args[0]); |
| break; |
| |
| default: |
| printf("Unsupported opcode %i (%s) in fragment shader\n", |
| inst->opcode, inst->opcode < MAX_OPCODE ? |
| _mesa_opcode_string(inst->opcode) : |
| "unknown"); |
| } |
| |
| for (i = 0; i < 4; i++) |
| if (inst->dst[i] && inst->dst[i]->spill_slot) |
| emit_spill(c, |
| inst->dst[i]->hw_reg, |
| inst->dst[i]->spill_slot); |
| } |
| |
| /* Only properly tested on ILK */ |
| if (p->brw->intel.gen == 5) { |
| brw_remove_duplicate_mrf_moves(p); |
| if (c->dispatch_width == 16) |
| brw_remove_grf_to_mrf_moves(p); |
| } |
| |
| if (unlikely(INTEL_DEBUG & DEBUG_WM)) { |
| int i; |
| |
| printf("wm-native:\n"); |
| for (i = 0; i < p->nr_insn; i++) |
| brw_disasm(stdout, &p->store[i], p->brw->intel.gen); |
| printf("\n"); |
| } |
| } |
| |