| /* |
| * Copyright 2011 Christoph Bumiller |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in |
| * all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF |
| * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #include "nv50/codegen/nv50_ir.h" |
| #include "nv50/codegen/nv50_ir_build_util.h" |
| |
| #include "nv50_ir_target_nv50.h" |
| |
| namespace nv50_ir { |
| |
| // nv50 doesn't support 32 bit integer multiplication |
| // |
| // ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl) |
| // ------------------- |
| // al*bh 00 HI32: (al * bh + ah * bl) >> 16 + (ah * bh) + |
| // ah*bh 00 00 ( carry1) << 16 + ( carry2) |
| // al*bl |
| // ah*bl 00 |
| // |
| // fffe0001 + fffe0001 |
| static bool |
| expandIntegerMUL(BuildUtil *bld, Instruction *mul) |
| { |
| const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH; |
| |
| DataType fTy = mul->sType; // full type |
| DataType hTy; |
| switch (fTy) { |
| case TYPE_S32: hTy = TYPE_S16; break; |
| case TYPE_U32: hTy = TYPE_U16; break; |
| case TYPE_U64: hTy = TYPE_U32; break; |
| case TYPE_S64: hTy = TYPE_S32; break; |
| default: |
| return false; |
| } |
| unsigned int fullSize = typeSizeof(fTy); |
| unsigned int halfSize = typeSizeof(hTy); |
| |
| Instruction *i[9]; |
| |
| bld->setPosition(mul, true); |
| |
| Value *a[2], *b[2]; |
| Value *c[2]; |
| Value *t[4]; |
| for (int j = 0; j < 4; ++j) |
| t[j] = bld->getSSA(fullSize); |
| |
| // split sources into halves |
| i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0)); |
| i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1)); |
| |
| i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]); |
| i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]); |
| i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8)); |
| i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]); |
| |
| if (highResult) { |
| Value *r[3]; |
| Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8)); |
| c[0] = bld->getSSA(1, FILE_FLAGS); |
| c[1] = bld->getSSA(1, FILE_FLAGS); |
| for (int j = 0; j < 3; ++j) |
| r[j] = bld->getSSA(fullSize); |
| |
| i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8)); |
| i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm); |
| bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]); |
| i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]); |
| |
| // set carry defs / sources |
| i[3]->setFlagsDef(1, c[0]); |
| i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry |
| i[6]->setPredicate(CC_C, c[0]); |
| i[5]->setFlagsSrc(3, c[1]); |
| } else { |
| bld->mkMov(mul->getDef(0), t[3]); |
| } |
| delete_Instruction(bld->getProgram(), mul); |
| |
| for (int j = 2; j <= (highResult ? 5 : 4); ++j) |
| if (i[j]) |
| i[j]->sType = hTy; |
| |
| return true; |
| } |
| |
| #define QOP_ADD 0 |
| #define QOP_SUBR 1 |
| #define QOP_SUB 2 |
| #define QOP_MOV2 3 |
| |
| // UL UR LL LR |
| #define QUADOP(q, r, s, t) \ |
| ((QOP_##q << 6) | (QOP_##r << 4) | \ |
| (QOP_##s << 2) | (QOP_##t << 0)) |
| |
| class NV50LegalizePostRA : public Pass |
| { |
| private: |
| virtual bool visit(Function *); |
| virtual bool visit(BasicBlock *); |
| |
| void handlePRERET(FlowInstruction *); |
| void replaceZero(Instruction *); |
| void split64BitOp(Instruction *); |
| |
| LValue *r63; |
| }; |
| |
| bool |
| NV50LegalizePostRA::visit(Function *fn) |
| { |
| Program *prog = fn->getProgram(); |
| |
| r63 = new_LValue(fn, FILE_GPR); |
| r63->reg.data.id = 63; |
| |
| // this is actually per-program, but we can do it all on visiting main() |
| std::list<Instruction *> *outWrites = |
| reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); |
| |
| if (outWrites) { |
| for (std::list<Instruction *>::iterator it = outWrites->begin(); |
| it != outWrites->end(); ++it) |
| (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0)); |
| // instructions will be deleted on exit |
| outWrites->clear(); |
| } |
| |
| return true; |
| } |
| |
| void |
| NV50LegalizePostRA::replaceZero(Instruction *i) |
| { |
| for (int s = 0; i->srcExists(s); ++s) { |
| ImmediateValue *imm = i->getSrc(s)->asImm(); |
| if (imm && imm->reg.data.u64 == 0) |
| i->setSrc(s, r63); |
| } |
| } |
| |
| void |
| NV50LegalizePostRA::split64BitOp(Instruction *i) |
| { |
| if (i->dType == TYPE_F64) { |
| if (i->op == OP_MAD) |
| i->op = OP_FMA; |
| if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA || |
| i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX || |
| i->op == OP_SET) |
| return; |
| i->dType = i->sType = TYPE_U32; |
| |
| i->bb->insertAfter(i, cloneForward(func, i)); |
| } |
| } |
| |
| // Emulate PRERET: jump to the target and call to the origin from there |
| // |
| // WARNING: atm only works if BBs are affected by at most a single PRERET |
| // |
| // BB:0 |
| // preret BB:3 |
| // (...) |
| // BB:3 |
| // (...) |
| // ---> |
| // BB:0 |
| // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate) |
| // (...) |
| // BB:3 |
| // bra BB:3 + n1 (skip the call) |
| // call BB:0 + n2 (skip bra at beginning of BB:0) |
| // (...) |
| void |
| NV50LegalizePostRA::handlePRERET(FlowInstruction *pre) |
| { |
| BasicBlock *bbE = pre->bb; |
| BasicBlock *bbT = pre->target.bb; |
| |
| pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0; |
| bbE->remove(pre); |
| bbE->insertHead(pre); |
| |
| Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT); |
| Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE); |
| |
| bbT->insertHead(call); |
| bbT->insertHead(skip); |
| |
| // NOTE: maybe split blocks to prevent the instructions from moving ? |
| |
| skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1; |
| call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2; |
| } |
| |
| bool |
| NV50LegalizePostRA::visit(BasicBlock *bb) |
| { |
| Instruction *i, *next; |
| |
| // remove pseudo operations and non-fixed no-ops, split 64 bit operations |
| for (i = bb->getFirst(); i; i = next) { |
| next = i->next; |
| if (i->isNop()) { |
| bb->remove(i); |
| } else |
| if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) { |
| handlePRERET(i->asFlow()); |
| } else { |
| if (i->op != OP_MOV && i->op != OP_PFETCH && |
| (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS)) |
| replaceZero(i); |
| if (typeSizeof(i->dType) == 8) |
| split64BitOp(i); |
| } |
| } |
| if (!bb->getEntry()) |
| return true; |
| |
| return true; |
| } |
| |
| class NV50LegalizeSSA : public Pass |
| { |
| public: |
| NV50LegalizeSSA(Program *); |
| |
| virtual bool visit(BasicBlock *bb); |
| |
| private: |
| void propagateWriteToOutput(Instruction *); |
| void handleDIV(Instruction *); |
| void handleMOD(Instruction *); |
| void handleMUL(Instruction *); |
| void handleAddrDef(Instruction *); |
| |
| inline bool isARL(const Instruction *) const; |
| |
| BuildUtil bld; |
| |
| std::list<Instruction *> *outWrites; |
| }; |
| |
| NV50LegalizeSSA::NV50LegalizeSSA(Program *prog) |
| { |
| bld.setProgram(prog); |
| |
| if (prog->optLevel >= 2 && |
| (prog->getType() == Program::TYPE_GEOMETRY || |
| prog->getType() == Program::TYPE_VERTEX)) |
| outWrites = |
| reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); |
| else |
| outWrites = NULL; |
| } |
| |
| void |
| NV50LegalizeSSA::propagateWriteToOutput(Instruction *st) |
| { |
| if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1) |
| return; |
| |
| // check def instruction can store |
| Instruction *di = st->getSrc(1)->defs.front()->getInsn(); |
| |
| // TODO: move exports (if beneficial) in common opt pass |
| if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1) |
| return; |
| for (int s = 0; di->srcExists(s); ++s) |
| if (di->src(s).getFile() == FILE_IMMEDIATE) |
| return; |
| |
| // We cannot set defs to non-lvalues before register allocation, so |
| // save & remove (to save registers) the exports and replace later. |
| outWrites->push_back(st); |
| st->bb->remove(st); |
| } |
| |
| bool |
| NV50LegalizeSSA::isARL(const Instruction *i) const |
| { |
| ImmediateValue imm; |
| |
| if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR) |
| return false; |
| if (!i->src(1).getImmediate(imm)) |
| return false; |
| return imm.isInteger(0); |
| } |
| |
| void |
| NV50LegalizeSSA::handleAddrDef(Instruction *i) |
| { |
| Instruction *arl; |
| |
| i->getDef(0)->reg.size = 2; // $aX are only 16 bit |
| |
| // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid |
| if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) { |
| if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR) |
| return; |
| if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS) |
| return; |
| } |
| |
| // turn $a sources into $r sources (can't operate on $a) |
| for (int s = 0; i->srcExists(s); ++s) { |
| Value *a = i->getSrc(s); |
| Value *r; |
| if (a->reg.file == FILE_ADDRESS) { |
| if (a->getInsn() && isARL(a->getInsn())) { |
| i->setSrc(s, a->getInsn()->getSrc(0)); |
| } else { |
| bld.setPosition(i, false); |
| r = bld.getSSA(); |
| bld.mkMov(r, a); |
| i->setSrc(s, r); |
| } |
| } |
| } |
| if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE) |
| return; |
| |
| // turn result back into $a |
| bld.setPosition(i, true); |
| arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0)); |
| i->setDef(0, arl->getSrc(0)); |
| } |
| |
| void |
| NV50LegalizeSSA::handleMUL(Instruction *mul) |
| { |
| if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2) |
| return; |
| Value *def = mul->getDef(0); |
| Value *pred = mul->getPredicate(); |
| CondCode cc = mul->cc; |
| if (pred) |
| mul->setPredicate(CC_ALWAYS, NULL); |
| |
| if (mul->op == OP_MAD) { |
| Instruction *add = mul; |
| bld.setPosition(add, false); |
| Value *res = cloneShallow(func, mul->getDef(0)); |
| mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1)); |
| add->op = OP_ADD; |
| add->setSrc(0, mul->getDef(0)); |
| add->setSrc(1, add->getSrc(2)); |
| for (int s = 2; add->srcExists(s); ++s) |
| add->setSrc(s, NULL); |
| mul->subOp = add->subOp; |
| add->subOp = 0; |
| } |
| expandIntegerMUL(&bld, mul); |
| if (pred) |
| def->getInsn()->setPredicate(cc, pred); |
| } |
| |
| // Use f32 division: first compute an approximate result, use it to reduce |
| // the dividend, which should then be representable as f32, divide the reduced |
| // dividend, and add the quotients. |
| void |
| NV50LegalizeSSA::handleDIV(Instruction *div) |
| { |
| const DataType ty = div->sType; |
| |
| if (ty != TYPE_U32 && ty != TYPE_S32) |
| return; |
| |
| Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond; |
| |
| bld.setPosition(div, false); |
| |
| Value *a, *af = bld.getSSA(); |
| Value *b, *bf = bld.getSSA(); |
| |
| bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0)); |
| bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1)); |
| |
| if (isSignedType(ty)) { |
| af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS); |
| bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS); |
| a = bld.getSSA(); |
| b = bld.getSSA(); |
| bld.mkOp1(OP_ABS, ty, a, div->getSrc(0)); |
| bld.mkOp1(OP_ABS, ty, b, div->getSrc(1)); |
| } else { |
| a = div->getSrc(0); |
| b = div->getSrc(1); |
| } |
| |
| bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf); |
| bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2)); |
| |
| bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z; |
| bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z; |
| |
| // get error of 1st result |
| expandIntegerMUL(&bld, |
| bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b)); |
| bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t); |
| |
| bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf); |
| |
| bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z; |
| bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf) |
| ->rnd = ROUND_Z; |
| bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients |
| |
| // correction: if modulus >= divisor, add 1 |
| expandIntegerMUL(&bld, |
| bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b)); |
| bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t); |
| bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b); |
| if (!isSignedType(ty)) { |
| div->op = OP_SUB; |
| div->setSrc(0, q); |
| div->setSrc(1, s); |
| } else { |
| t = q; |
| bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s); |
| s = bld.getSSA(); |
| t = bld.getSSA(); |
| // fix the sign |
| bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1)) |
| ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS))); |
| bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond); |
| bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond); |
| |
| div->op = OP_UNION; |
| div->setSrc(0, s); |
| div->setSrc(1, t); |
| } |
| } |
| |
| void |
| NV50LegalizeSSA::handleMOD(Instruction *mod) |
| { |
| if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32) |
| return; |
| bld.setPosition(mod, false); |
| |
| Value *q = bld.getSSA(); |
| Value *m = bld.getSSA(); |
| |
| bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1)); |
| handleDIV(q->getInsn()); |
| |
| bld.setPosition(mod, false); |
| expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1))); |
| |
| mod->op = OP_SUB; |
| mod->setSrc(1, m); |
| } |
| |
| bool |
| NV50LegalizeSSA::visit(BasicBlock *bb) |
| { |
| Instruction *insn, *next; |
| // skipping PHIs (don't pass them to handleAddrDef) ! |
| for (insn = bb->getEntry(); insn; insn = next) { |
| next = insn->next; |
| |
| switch (insn->op) { |
| case OP_EXPORT: |
| if (outWrites) |
| propagateWriteToOutput(insn); |
| break; |
| case OP_DIV: |
| handleDIV(insn); |
| break; |
| case OP_MOD: |
| handleMOD(insn); |
| break; |
| case OP_MAD: |
| case OP_MUL: |
| handleMUL(insn); |
| break; |
| default: |
| break; |
| } |
| |
| if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS) |
| handleAddrDef(insn); |
| } |
| return true; |
| } |
| |
| class NV50LoweringPreSSA : public Pass |
| { |
| public: |
| NV50LoweringPreSSA(Program *); |
| |
| private: |
| virtual bool visit(Instruction *); |
| virtual bool visit(Function *); |
| |
| bool handleRDSV(Instruction *); |
| bool handleWRSV(Instruction *); |
| |
| bool handleEXPORT(Instruction *); |
| |
| bool handleDIV(Instruction *); |
| bool handleSQRT(Instruction *); |
| bool handlePOW(Instruction *); |
| |
| bool handleSET(Instruction *); |
| bool handleSLCT(CmpInstruction *); |
| bool handleSELP(Instruction *); |
| |
| bool handleTEX(TexInstruction *); |
| bool handleTXB(TexInstruction *); // I really |
| bool handleTXL(TexInstruction *); // hate |
| bool handleTXD(TexInstruction *); // these 3 |
| |
| bool handleCALL(Instruction *); |
| bool handlePRECONT(Instruction *); |
| bool handleCONT(Instruction *); |
| |
| void checkPredicate(Instruction *); |
| |
| private: |
| const Target *const targ; |
| |
| BuildUtil bld; |
| |
| Value *tid; |
| }; |
| |
| NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) : |
| targ(prog->getTarget()), tid(NULL) |
| { |
| bld.setProgram(prog); |
| } |
| |
| bool |
| NV50LoweringPreSSA::visit(Function *f) |
| { |
| BasicBlock *root = BasicBlock::get(func->cfg.getRoot()); |
| |
| if (prog->getType() == Program::TYPE_COMPUTE) { |
| // Add implicit "thread id" argument in $r0 to the function |
| Value *arg = new_LValue(func, FILE_GPR); |
| arg->reg.data.id = 0; |
| f->ins.push_back(arg); |
| |
| bld.setPosition(root, false); |
| tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0); |
| } |
| |
| return true; |
| } |
| |
| // move array source to first slot, convert to u16, add indirections |
| bool |
| NV50LoweringPreSSA::handleTEX(TexInstruction *i) |
| { |
| const int arg = i->tex.target.getArgCount(); |
| const int dref = arg; |
| const int lod = i->tex.target.isShadow() ? (arg + 1) : arg; |
| |
| // dref comes before bias/lod |
| if (i->tex.target.isShadow()) |
| if (i->op == OP_TXB || i->op == OP_TXL) |
| i->swapSources(dref, lod); |
| |
| // array index must be converted to u32 |
| if (i->tex.target.isArray()) { |
| Value *layer = i->getSrc(arg - 1); |
| LValue *src = new_LValue(func, FILE_GPR); |
| bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer); |
| bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511)); |
| i->setSrc(arg - 1, src); |
| |
| if (i->tex.target.isCube()) { |
| // Value *face = layer; |
| Value *x, *y; |
| x = new_LValue(func, FILE_GPR); |
| y = new_LValue(func, FILE_GPR); |
| layer = new_LValue(func, FILE_GPR); |
| |
| i->tex.target = TEX_TARGET_2D_ARRAY; |
| |
| // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer |
| bld.mkMov(x, i->getSrc(0)); |
| bld.mkMov(y, i->getSrc(1)); |
| bld.mkMov(layer, i->getSrc(3)); |
| |
| i->setSrc(0, x); |
| i->setSrc(1, y); |
| i->setSrc(2, layer); |
| i->setSrc(3, i->getSrc(4)); |
| i->setSrc(4, NULL); |
| } |
| } |
| |
| // texel offsets are 3 immediate fields in the instruction, |
| // nv50 cannot do textureGatherOffsets |
| assert(i->tex.useOffsets <= 1); |
| |
| return true; |
| } |
| |
| // Bias must be equal for all threads of a quad or lod calculation will fail. |
| // |
| // The lanes of a quad are grouped by the bit in the condition register they |
| // have set, which is selected by differing bias values. |
| // Move the input values for TEX into a new register set for each group and |
| // execute TEX only for a specific group. |
| // We always need to use 4 new registers for the inputs/outputs because the |
| // implicitly calculated derivatives must be correct. |
| // |
| // TODO: move to SSA phase so we can easily determine whether bias is constant |
| bool |
| NV50LoweringPreSSA::handleTXB(TexInstruction *i) |
| { |
| const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O }; |
| int l, d; |
| |
| handleTEX(i); |
| Value *bias = i->getSrc(i->tex.target.getArgCount()); |
| if (bias->isUniform()) |
| return true; |
| |
| Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(), |
| bld.loadImm(NULL, 1)); |
| bld.setPosition(cond, false); |
| |
| for (l = 1; l < 4; ++l) { |
| const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR); |
| Value *bit = bld.getSSA(); |
| Value *pred = bld.getScratch(1, FILE_FLAGS); |
| Value *imm = bld.loadImm(NULL, (1 << l)); |
| bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0; |
| bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred); |
| cond->setSrc(l, bit); |
| } |
| Value *flags = bld.getScratch(1, FILE_FLAGS); |
| bld.setPosition(cond, true); |
| bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0)); |
| |
| Instruction *tex[4]; |
| for (l = 0; l < 4; ++l) { |
| (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags); |
| bld.insert(tex[l]); |
| } |
| |
| Value *res[4][4]; |
| for (d = 0; i->defExists(d); ++d) |
| res[0][d] = tex[0]->getDef(d); |
| for (l = 1; l < 4; ++l) { |
| for (d = 0; tex[l]->defExists(d); ++d) { |
| res[l][d] = cloneShallow(func, res[0][d]); |
| bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags); |
| } |
| } |
| |
| for (d = 0; i->defExists(d); ++d) { |
| Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d)); |
| for (l = 0; l < 4; ++l) |
| dst->setSrc(l, res[l][d]); |
| } |
| delete_Instruction(prog, i); |
| return true; |
| } |
| |
| // LOD must be equal for all threads of a quad. |
| // Unlike with TXB, here we can just diverge since there's no LOD calculation |
| // that would require all 4 threads' sources to be set up properly. |
| bool |
| NV50LoweringPreSSA::handleTXL(TexInstruction *i) |
| { |
| handleTEX(i); |
| Value *lod = i->getSrc(i->tex.target.getArgCount()); |
| if (lod->isUniform()) |
| return true; |
| |
| BasicBlock *currBB = i->bb; |
| BasicBlock *texiBB = i->bb->splitBefore(i, false); |
| BasicBlock *joinBB = i->bb->splitAfter(i); |
| |
| bld.setPosition(currBB, true); |
| currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); |
| |
| for (int l = 0; l <= 3; ++l) { |
| const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR); |
| Value *pred = bld.getScratch(1, FILE_FLAGS); |
| bld.setPosition(currBB, true); |
| bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0; |
| bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1; |
| currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD); |
| if (l <= 2) { |
| BasicBlock *laneBB = new BasicBlock(func); |
| currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE); |
| currBB = laneBB; |
| } |
| } |
| bld.setPosition(joinBB, false); |
| bld.mkOp(OP_JOIN, TYPE_NONE, NULL); |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleTXD(TexInstruction *i) |
| { |
| static const uint8_t qOps[4][2] = |
| { |
| { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0 |
| { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1 |
| { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2 |
| { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3 |
| }; |
| Value *def[4][4]; |
| Value *crd[3]; |
| Instruction *tex; |
| Value *zero = bld.loadImm(bld.getSSA(), 0); |
| int l, c; |
| const int dim = i->tex.target.getDim(); |
| |
| handleTEX(i); |
| i->op = OP_TEX; // no need to clone dPdx/dPdy later |
| |
| for (c = 0; c < dim; ++c) |
| crd[c] = bld.getScratch(); |
| |
| bld.mkOp(OP_QUADON, TYPE_NONE, NULL); |
| for (l = 0; l < 4; ++l) { |
| // mov coordinates from lane l to all lanes |
| for (c = 0; c < dim; ++c) |
| bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero); |
| // add dPdx from lane l to lanes dx |
| for (c = 0; c < dim; ++c) |
| bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]); |
| // add dPdy from lane l to lanes dy |
| for (c = 0; c < dim; ++c) |
| bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]); |
| // texture |
| bld.insert(tex = cloneForward(func, i)); |
| for (c = 0; c < dim; ++c) |
| tex->setSrc(c, crd[c]); |
| // save results |
| for (c = 0; i->defExists(c); ++c) { |
| Instruction *mov; |
| def[c][l] = bld.getSSA(); |
| mov = bld.mkMov(def[c][l], tex->getDef(c)); |
| mov->fixed = 1; |
| mov->lanes = 1 << l; |
| } |
| } |
| bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); |
| |
| for (c = 0; i->defExists(c); ++c) { |
| Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c)); |
| for (l = 0; l < 4; ++l) |
| u->setSrc(l, def[c][l]); |
| } |
| |
| i->bb->remove(i); |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleSET(Instruction *i) |
| { |
| if (i->dType == TYPE_F32) { |
| bld.setPosition(i, true); |
| i->dType = TYPE_U32; |
| bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0)); |
| bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0)); |
| } |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleSLCT(CmpInstruction *i) |
| { |
| Value *src0 = bld.getSSA(); |
| Value *src1 = bld.getSSA(); |
| Value *pred = bld.getScratch(1, FILE_FLAGS); |
| |
| Value *v0 = i->getSrc(0); |
| Value *v1 = i->getSrc(1); |
| // XXX: these probably shouldn't be immediates in the first place ... |
| if (v0->asImm()) |
| v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0); |
| if (v1->asImm()) |
| v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0); |
| |
| bld.setPosition(i, true); |
| bld.mkMov(src0, v0)->setPredicate(CC_NE, pred); |
| bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred); |
| bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1); |
| |
| bld.setPosition(i, false); |
| i->op = OP_SET; |
| i->setFlagsDef(0, pred); |
| i->dType = TYPE_U8; |
| i->setSrc(0, i->getSrc(2)); |
| i->setSrc(2, NULL); |
| i->setSrc(1, bld.loadImm(NULL, 0)); |
| |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleSELP(Instruction *i) |
| { |
| Value *src0 = bld.getSSA(); |
| Value *src1 = bld.getSSA(); |
| |
| Value *v0 = i->getSrc(0); |
| Value *v1 = i->getSrc(1); |
| if (v0->asImm()) |
| v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0); |
| if (v1->asImm()) |
| v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0); |
| |
| bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2)); |
| bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2)); |
| bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1); |
| delete_Instruction(prog, i); |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleWRSV(Instruction *i) |
| { |
| Symbol *sym = i->getSrc(0)->asSym(); |
| |
| // these are all shader outputs, $sreg are not writeable |
| uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym); |
| if (addr >= 0x400) |
| return false; |
| sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr); |
| |
| bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1)); |
| |
| bld.getBB()->remove(i); |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleCALL(Instruction *i) |
| { |
| if (prog->getType() == Program::TYPE_COMPUTE) { |
| // Add implicit "thread id" argument in $r0 to the function |
| i->setSrc(i->srcCount(), tid); |
| } |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handlePRECONT(Instruction *i) |
| { |
| delete_Instruction(prog, i); |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleCONT(Instruction *i) |
| { |
| i->op = OP_BRA; |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleRDSV(Instruction *i) |
| { |
| Symbol *sym = i->getSrc(0)->asSym(); |
| uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym); |
| Value *def = i->getDef(0); |
| SVSemantic sv = sym->reg.data.sv.sv; |
| int idx = sym->reg.data.sv.index; |
| |
| if (addr >= 0x400) // mov $sreg |
| return true; |
| |
| switch (sv) { |
| case SV_POSITION: |
| assert(prog->getType() == Program::TYPE_FRAGMENT); |
| bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL); |
| break; |
| case SV_FACE: |
| bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL); |
| if (i->dType == TYPE_F32) { |
| bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000)); |
| bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000)); |
| } |
| break; |
| case SV_NCTAID: |
| case SV_CTAID: |
| case SV_NTID: |
| if ((sv == SV_NCTAID && idx >= 2) || |
| (sv == SV_NTID && idx >= 3)) { |
| bld.mkMov(def, bld.mkImm(1)); |
| } else if (sv == SV_CTAID && idx >= 2) { |
| bld.mkMov(def, bld.mkImm(0)); |
| } else { |
| Value *x = bld.getSSA(2); |
| bld.mkOp1(OP_LOAD, TYPE_U16, x, |
| bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr)); |
| bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x); |
| } |
| break; |
| case SV_TID: |
| if (idx == 0) { |
| bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff)); |
| } else if (idx == 1) { |
| bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000)); |
| bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16)); |
| } else if (idx == 2) { |
| bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26)); |
| } else { |
| bld.mkMov(def, bld.mkImm(0)); |
| } |
| break; |
| default: |
| bld.mkFetch(i->getDef(0), i->dType, |
| FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL); |
| break; |
| } |
| bld.getBB()->remove(i); |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleDIV(Instruction *i) |
| { |
| if (!isFloatType(i->dType)) |
| return true; |
| bld.setPosition(i, false); |
| Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1)); |
| i->op = OP_MUL; |
| i->setSrc(1, rcp->getDef(0)); |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleSQRT(Instruction *i) |
| { |
| Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, |
| bld.getSSA(), i->getSrc(0)); |
| i->op = OP_MUL; |
| i->setSrc(1, rsq->getDef(0)); |
| |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handlePOW(Instruction *i) |
| { |
| LValue *val = bld.getScratch(); |
| |
| bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0)); |
| bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1; |
| bld.mkOp1(OP_PREEX2, TYPE_F32, val, val); |
| |
| i->op = OP_EX2; |
| i->setSrc(0, val); |
| i->setSrc(1, NULL); |
| |
| return true; |
| } |
| |
| bool |
| NV50LoweringPreSSA::handleEXPORT(Instruction *i) |
| { |
| if (prog->getType() == Program::TYPE_FRAGMENT) { |
| if (i->getIndirect(0, 0)) { |
| // TODO: redirect to l[] here, load to GPRs at exit |
| return false; |
| } else { |
| int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units |
| |
| i->op = OP_MOV; |
| i->subOp = NV50_IR_SUBOP_MOV_FINAL; |
| i->src(0).set(i->src(1)); |
| i->setSrc(1, NULL); |
| i->setDef(0, new_LValue(func, FILE_GPR)); |
| i->getDef(0)->reg.data.id = id; |
| |
| prog->maxGPR = MAX2(prog->maxGPR, id); |
| } |
| } |
| return true; |
| } |
| |
| // Set flags according to predicate and make the instruction read $cX. |
| void |
| NV50LoweringPreSSA::checkPredicate(Instruction *insn) |
| { |
| Value *pred = insn->getPredicate(); |
| Value *cdst; |
| |
| if (!pred || pred->reg.file == FILE_FLAGS) |
| return; |
| cdst = bld.getSSA(1, FILE_FLAGS); |
| |
| bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, cdst, bld.loadImm(NULL, 0), pred); |
| |
| insn->setPredicate(insn->cc, cdst); |
| } |
| |
| // |
| // - add quadop dance for texturing |
| // - put FP outputs in GPRs |
| // - convert instruction sequences |
| // |
| bool |
| NV50LoweringPreSSA::visit(Instruction *i) |
| { |
| bld.setPosition(i, false); |
| |
| if (i->cc != CC_ALWAYS) |
| checkPredicate(i); |
| |
| switch (i->op) { |
| case OP_TEX: |
| case OP_TXF: |
| case OP_TXG: |
| return handleTEX(i->asTex()); |
| case OP_TXB: |
| return handleTXB(i->asTex()); |
| case OP_TXL: |
| return handleTXL(i->asTex()); |
| case OP_TXD: |
| return handleTXD(i->asTex()); |
| case OP_EX2: |
| bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0)); |
| i->setSrc(0, i->getDef(0)); |
| break; |
| case OP_SET: |
| return handleSET(i); |
| case OP_SLCT: |
| return handleSLCT(i->asCmp()); |
| case OP_SELP: |
| return handleSELP(i); |
| case OP_POW: |
| return handlePOW(i); |
| case OP_DIV: |
| return handleDIV(i); |
| case OP_SQRT: |
| return handleSQRT(i); |
| case OP_EXPORT: |
| return handleEXPORT(i); |
| case OP_RDSV: |
| return handleRDSV(i); |
| case OP_WRSV: |
| return handleWRSV(i); |
| case OP_CALL: |
| return handleCALL(i); |
| case OP_PRECONT: |
| return handlePRECONT(i); |
| case OP_CONT: |
| return handleCONT(i); |
| default: |
| break; |
| } |
| return true; |
| } |
| |
| bool |
| TargetNV50::runLegalizePass(Program *prog, CGStage stage) const |
| { |
| bool ret = false; |
| |
| if (stage == CG_STAGE_PRE_SSA) { |
| NV50LoweringPreSSA pass(prog); |
| ret = pass.run(prog, false, true); |
| } else |
| if (stage == CG_STAGE_SSA) { |
| if (!prog->targetPriv) |
| prog->targetPriv = new std::list<Instruction *>(); |
| NV50LegalizeSSA pass(prog); |
| ret = pass.run(prog, false, true); |
| } else |
| if (stage == CG_STAGE_POST_RA) { |
| NV50LegalizePostRA pass; |
| ret = pass.run(prog, false, true); |
| if (prog->targetPriv) |
| delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); |
| } |
| return ret; |
| } |
| |
| } // namespace nv50_ir |