blob: 3e0dc755d5671d62a3627370f9cf6e2516aebadc [file] [log] [blame]
/*
* Copyright 2011 Christoph Bumiller
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
* OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "nv50/codegen/nv50_ir.h"
#include "nv50/codegen/nv50_ir_build_util.h"
#include "nv50_ir_target_nv50.h"
namespace nv50_ir {
// nv50 doesn't support 32 bit integer multiplication
//
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
// -------------------
//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
//       al*bl
//    ah*bl 00
//
// fffe0001 + fffe0001
// (presumably the worst case for carry1: 0xffff * 0xffff = 0xfffe0001, and
//  the sum of two such cross products does not fit in 32 bits)
// Replace an integer MUL on a 32 or 64 bit type by a sequence of multiplies
// and multiply-adds whose source type is the corresponding half-width type.
//
// @bld: builder used to emit the replacement; it is repositioned after @mul
// @mul: the multiply to expand; it is deleted on success
// Returns false (leaving @mul untouched) if mul->sType is not S32/U32/S64/U64.
//
// If mul->subOp is NV50_IR_SUBOP_MUL_HIGH the high half of the product is
// written to mul's def 0, otherwise the low half is.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
DataType fTy = mul->sType; // full type
DataType hTy;
// pick the half-width type matching the full type's signedness
switch (fTy) {
case TYPE_S32: hTy = TYPE_S16; break;
case TYPE_U32: hTy = TYPE_U16; break;
case TYPE_U64: hTy = TYPE_U32; break;
case TYPE_S64: hTy = TYPE_S32; break;
default:
return false;
}
unsigned int fullSize = typeSizeof(fTy);
unsigned int halfSize = typeSizeof(hTy);
// i[] is not zero-initialized; only the slots assigned below may be read
Instruction *i[9];
bld->setPosition(mul, true);
Value *a[2], *b[2];
Value *c[2];
Value *t[4];
for (int j = 0; j < 4; ++j)
t[j] = bld->getSSA(fullSize);
// split sources into halves
i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
// t0 = a0 * b1
i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
// t1 = a1 * b0 + t0 (sum of the two cross products)
i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
// t2 = t1 << (bits in a half)
i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
// t3 = a0 * b0 + t2 (this is the value moved to the def for the low result)
i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
if (highResult) {
Value *r[3];
// NOTE(review): for the 64 bit types halfSize * 8 is 32, so this shifts an
// int by 32 bits — confirm the 64 bit high-result path is actually used
Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
c[0] = bld->getSSA(1, FILE_FLAGS);
c[1] = bld->getSSA(1, FILE_FLAGS);
for (int j = 0; j < 3; ++j)
r[j] = bld->getSSA(fullSize);
// r0 = t1 >> (bits in a half)
i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
// r1 = r0 + imm; predicated below on the carry written by i[3]
i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
// r2 merges the predicated r1 with r0
bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
// result = a1 * b1 + r2, with c[1] attached as flags source below
i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
// set carry defs / sources
i[3]->setFlagsDef(1, c[0]);
i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
i[6]->setPredicate(CC_C, c[0]);
i[5]->setFlagsSrc(3, c[1]);
} else {
bld->mkMov(mul->getDef(0), t[3]);
}
delete_Instruction(bld->getProgram(), mul);
// the multiplies read half-width sources: i[2..4], plus i[5] for the high
// result (all of these slots were assigned above)
for (int j = 2; j <= (highResult ? 5 : 4); ++j)
if (i[j])
i[j]->sType = hTy;
return true;
}
// 2-bit operation codes combined by QUADOP() below.
#define QOP_ADD 0
#define QOP_SUBR 1
#define QOP_SUB 2
#define QOP_MOV2 3
// UL UR LL LR
// Pack four operation codes into one byte: q in bits 7:6, r in bits 5:4,
// s in bits 3:2 and t in bits 1:0. The arguments are QOP_ suffixes
// (ADD, SUBR, SUB, MOV2), e.g. QUADOP(SUBR, SUBR, SUBR, SUBR).
#define QUADOP(q, r, s, t) \
((QOP_##q << 6) | (QOP_##r << 4) | \
(QOP_##s << 2) | (QOP_##t << 0))
// Legalization pass run for CG_STAGE_POST_RA (see TargetNV50::runLegalizePass).
class NV50LegalizePostRA : public Pass
{
private:
// per-function setup: creates r63 and applies the saved output writes
virtual bool visit(Function *);
// removes no-ops, emulates PRERET, replaces zero immediates, splits 64 bit ops
virtual bool visit(BasicBlock *);
void handlePRERET(FlowInstruction *);
// replace immediate sources equal to 0 by r63
void replaceZero(Instruction *);
void split64BitOp(Instruction *);
// GPR value with register id 63, (re)created in visit(Function *)
LValue *r63;
};
// Per-function setup for the post-RA pass.
//
// Creates the r63 value (GPR with id 63) used by replaceZero(), then, if the
// program carries a list of saved output writes in targetPriv, redirects the
// instruction defining each write's source 1 so that its def 0 becomes the
// write's source 0. The list is emptied afterwards.
bool
NV50LegalizePostRA::visit(Function *fn)
{
   typedef std::list<Instruction *> InsnList;

   Program *program = fn->getProgram();

   r63 = new_LValue(fn, FILE_GPR);
   r63->reg.data.id = 63;

   // this is actually per-program, but we can do it all on visiting main()
   InsnList *outWrites = reinterpret_cast<InsnList *>(program->targetPriv);
   if (!outWrites)
      return true;

   for (InsnList::iterator it = outWrites->begin();
        it != outWrites->end(); ++it) {
      Instruction *st = *it;
      Instruction *producer = st->getSrc(1)->defs.front()->getInsn();
      producer->setDef(0, st->getSrc(0));
   }
   // instructions will be deleted on exit
   outWrites->clear();

   return true;
}
// Replace every immediate source of @i whose 64 bit payload is 0 by r63.
void
NV50LegalizePostRA::replaceZero(Instruction *i)
{
   int s = 0;
   while (i->srcExists(s)) {
      ImmediateValue *imm = i->getSrc(s)->asImm();
      const bool isZeroImm = imm && imm->reg.data.u64 == 0;
      if (isZeroImm)
         i->setSrc(s, r63);
      ++s;
   }
}
// Handle an instruction with a 64 bit destination type.
//
// Only TYPE_F64 instructions are touched. MAD is renamed to FMA; ADD, MUL,
// FMA, CVT, MIN, MAX and SET are then left as they are. Any other f64
// operation is retyped to U32 and followed by a forward clone of itself.
void
NV50LegalizePostRA::split64BitOp(Instruction *i)
{
   if (i->dType != TYPE_F64)
      return;

   if (i->op == OP_MAD)
      i->op = OP_FMA;

   switch (i->op) {
   case OP_ADD:
   case OP_MUL:
   case OP_FMA:
   case OP_CVT:
   case OP_MIN:
   case OP_MAX:
   case OP_SET:
      return;
   default:
      break;
   }

   i->sType = TYPE_U32;
   i->dType = TYPE_U32;
   i->bb->insertAfter(i, cloneForward(func, i));
}
// Emulate PRERET: jump to the target and call to the origin from there
//
// WARNING: atm only works if BBs are affected by at most a single PRERET
//
// BB:0
//   preret BB:3
//   (...)
// BB:3
//   (...)
//            --->
// BB:0
//   bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
//   (...)
// BB:3
//   bra BB:3 + n1 (skip the call)
//   call BB:0 + n2 (skip bra at beginning of BB:0)
//   (...)
//
// The three emitted flow instructions keep OP_PRERET and are told apart by
// their subOp: NV50_IR_SUBOP_EMU_PRERET + 0 / + 1 / + 2.
void
NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
{
   BasicBlock *origin = pre->bb;
   BasicBlock *target = pre->target.bb;

   // the original instruction becomes the first instruction of its block
   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
   origin->remove(pre);
   origin->insertHead(pre);

   Instruction *skipInsn = new_FlowInstruction(func, OP_PRERET, target);
   Instruction *callInsn = new_FlowInstruction(func, OP_PRERET, origin);

   // inserted in reverse so that the skip ends up in front of the call
   target->insertHead(callInsn);
   target->insertHead(skipInsn);

   // NOTE: maybe split blocks to prevent the instructions from moving ?

   skipInsn->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
   callInsn->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
}
// Post-RA cleanup of one basic block.
//
// For every instruction of @bb:
//  - no-ops (as reported by isNop()) are removed from the block,
//  - OP_PRERET is emulated via handlePRERET() on chipsets below 0xa0,
//  - otherwise zero immediates are replaced by r63 (except for MOV, PFETCH
//    and instructions whose def 0 is in FILE_ADDRESS) and instructions with
//    an 8 byte destination type are passed to split64BitOp().
//
// Always returns true.
bool
NV50LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (Instruction *i = bb->getFirst(); i; i = next) {
      // fetch the successor first: the handlers may unlink or move i
      next = i->next;

      if (i->isNop()) {
         bb->remove(i);
      } else if (i->op == OP_PRERET &&
                 prog->getTarget()->getChipset() < 0xa0) {
         handlePRERET(i->asFlow());
      } else {
         if (i->op != OP_MOV && i->op != OP_PFETCH &&
             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
            replaceZero(i);
         if (typeSizeof(i->dType) == 8)
            split64BitOp(i);
      }
   }

   // The former "if (!bb->getEntry()) return true;" was dead code: both
   // outcomes returned true.
   return true;
}
// Legalization pass run for CG_STAGE_SSA (see TargetNV50::runLegalizePass).
class NV50LegalizeSSA : public Pass
{
public:
NV50LegalizeSSA(Program *);
virtual bool visit(BasicBlock *bb);
private:
// save an EXPORT in outWrites and remove it from its block, if eligible
void propagateWriteToOutput(Instruction *);
void handleDIV(Instruction *);
void handleMOD(Instruction *);
void handleMUL(Instruction *);
// fix up an instruction whose def 0 lives in FILE_ADDRESS
void handleAddrDef(Instruction *);
// true for SHL of a GPR by an immediate for which isInteger(0) holds
inline bool isARL(const Instruction *) const;
BuildUtil bld;
// list stored in Program::targetPriv, or NULL when exports are not propagated
std::list<Instruction *> *outWrites;
};
// Bind the builder to @prog and decide whether exports are collected:
// outWrites points at the list kept in prog->targetPriv only for geometry
// and vertex programs compiled with optLevel >= 2, and is NULL otherwise.
NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
{
   bld.setProgram(prog);

   outWrites = NULL;

   if (prog->optLevel < 2)
      return;
   if (prog->getType() != Program::TYPE_GEOMETRY &&
       prog->getType() != Program::TYPE_VERTEX)
      return;

   outWrites = reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
}
// Try to fold an export into the instruction that produces the exported
// value.
//
// The export @st is saved in outWrites and unlinked from its block when:
//  - its source 0 is not indirectly addressed,
//  - the exported value (source 1) has a reference count of exactly 1,
//  - the defining instruction is neither a pseudo nor a texture operation
//    and defCount(0xff, true) reports at most one def,
//  - none of the defining instruction's sources is an immediate.
// Otherwise nothing is changed.
void
NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
{
   if (st->src(0).isIndirect(0))
      return;

   Value *exported = st->getSrc(1);
   if (exported->refCount() != 1)
      return;

   // check def instruction can store
   Instruction *di = exported->defs.front()->getInsn();

   // TODO: move exports (if beneficial) in common opt pass
   if (di->isPseudo())
      return;
   if (isTextureOp(di->op))
      return;
   if (di->defCount(0xff, true) > 1)
      return;

   int s = 0;
   while (di->srcExists(s)) {
      if (di->src(s).getFile() == FILE_IMMEDIATE)
         return;
      ++s;
   }

   // We cannot set defs to non-lvalues before register allocation, so
   // save & remove (to save registers) the exports and replace later.
   outWrites->push_back(st);
   st->bb->remove(st);
}
// Returns true iff @i is an OP_SHL whose source 0 is in FILE_GPR and whose
// source 1 resolves to an immediate for which isInteger(0) holds.
bool
NV50LegalizeSSA::isARL(const Instruction *i) const
{
   ImmediateValue imm;

   return i->op == OP_SHL &&
          i->src(0).getFile() == FILE_GPR &&
          i->src(1).getImmediate(imm) &&
          imm.isInteger(0);
}
// Legalize an instruction whose def 0 is in FILE_ADDRESS (called from
// visit(BasicBlock *) for exactly those instructions).
//
// The def is resized to 2 bytes. Instructions already in one of the two
// accepted forms are left alone; otherwise address-file sources are replaced
// by GPR values and, unless the instruction is a SHL by an immediate, a
// SHL by 0 is appended that writes the original address def.
void
NV50LegalizeSSA::handleAddrDef(Instruction *i)
{
Instruction *arl;
i->getDef(0)->reg.size = 2; // $aX are only 16 bit
// only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
return;
if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
return;
}
// turn $a sources into $r sources (can't operate on $a)
for (int s = 0; i->srcExists(s); ++s) {
Value *a = i->getSrc(s);
Value *r;
if (a->reg.file == FILE_ADDRESS) {
if (a->getInsn() && isARL(a->getInsn())) {
// the address was produced by a shift matching isARL(): read that
// shift's source 0 directly instead
i->setSrc(s, a->getInsn()->getSrc(0));
} else {
// otherwise copy the address into a fresh value before i
bld.setPosition(i, false);
r = bld.getSSA();
bld.mkMov(r, a);
i->setSrc(s, r);
}
}
}
// a SHL by an immediate can keep its address def as is
if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
return;
// turn result back into $a
// the new SHL-by-0 (inserted after i) takes over the address def and reads
// a fresh value, which i is then made to define instead
bld.setPosition(i, true);
arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
i->setDef(0, arl->getSrc(0));
}
// Lower an integer MUL or MAD with sources wider than 2 bytes.
//
// Float multiplies and those with source types of at most 2 bytes are left
// untouched. A MAD is first split into a MUL (inserted before it) and an ADD
// (the original instruction, rewritten in place); the MUL is then handed to
// expandIntegerMUL(). A predicate on the original instruction is removed
// up front and re-applied to whichever instruction defines the original
// def 0 afterwards.
void
NV50LegalizeSSA::handleMUL(Instruction *mul)
{
if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
return;
// remember def and predicate now: mul may be replaced below
Value *def = mul->getDef(0);
Value *pred = mul->getPredicate();
CondCode cc = mul->cc;
if (pred)
mul->setPredicate(CC_ALWAYS, NULL);
if (mul->op == OP_MAD) {
Instruction *add = mul;
bld.setPosition(add, false);
// the product goes to a shallow clone of the MAD's def
Value *res = cloneShallow(func, mul->getDef(0));
mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
// rewrite the MAD as ADD(product, former source 2)
add->op = OP_ADD;
add->setSrc(0, mul->getDef(0));
add->setSrc(1, add->getSrc(2));
for (int s = 2; add->srcExists(s); ++s)
add->setSrc(s, NULL);
// the subOp (e.g. the high-result request) moves to the multiply
mul->subOp = add->subOp;
add->subOp = 0;
}
// return value is not checked here
expandIntegerMUL(&bld, mul);
if (pred)
def->getInsn()->setPredicate(cc, pred);
}
// Use f32 division: first compute an approximate result, use it to reduce
// the dividend, which should then be representable as f32, divide the reduced
// dividend, and add the quotients.
//
// Only divisions with source type U32 or S32 are handled; others are left
// untouched. The emitted code is inserted before @div, and @div itself is
// rewritten in place into the final instruction (a SUB for unsigned, a UNION
// of the two sign-selected values for signed).
void
NV50LegalizeSSA::handleDIV(Instruction *div)
{
const DataType ty = div->sType;
if (ty != TYPE_U32 && ty != TYPE_S32)
return;
Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
bld.setPosition(div, false);
Value *a, *af = bld.getSSA();
Value *b, *bf = bld.getSSA();
// af, bf: operands converted to f32
bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
if (isSignedType(ty)) {
// signed: work on absolute values, the sign is fixed up at the end
af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
a = bld.getSSA();
b = bld.getSSA();
bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
} else {
a = div->getSrc(0);
b = div->getSrc(1);
}
// bf = rcp(bf), then an integer add of -2 on the float's bit pattern
// (presumably to bias the reciprocal downwards — TODO confirm)
bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
// q0 = first quotient estimate, both steps with ROUND_Z
bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
// get error of 1st result
expandIntegerMUL(&bld,
bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
// divide the remaining error the same way to get the second quotient qR
bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
->rnd = ROUND_Z;
bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
// correction: if modulus >= divisor, add 1
// m = a - q * b; s = SET(m >= b); the result is computed as q - s
// (presumably SET yields -1 when true, making this q + 1 — TODO confirm)
expandIntegerMUL(&bld,
bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
if (!isSignedType(ty)) {
// unsigned: the original instruction becomes the final q - s
div->op = OP_SUB;
div->setSrc(0, q);
div->setSrc(1, s);
} else {
t = q;
bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
s = bld.getSSA();
t = bld.getSSA();
// fix the sign
// flags come from XOR of the original operands; s = -q under CC_S,
// t = q under CC_NS, and the original instruction merges both
bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
div->op = OP_UNION;
div->setSrc(0, s);
div->setSrc(1, t);
}
}
// Lower a 32 bit integer MOD to  src0 - (src0 / src1) * src1.
//
// Only destination types U32 and S32 are handled. A DIV is emitted before
// @mod and lowered immediately through handleDIV(); the product of the
// quotient and the divisor is built with expandIntegerMUL(), and @mod itself
// is turned into the final SUB (its source 0 is kept).
void
NV50LegalizeSSA::handleMOD(Instruction *mod)
{
   const bool is32BitInt = mod->dType == TYPE_U32 || mod->dType == TYPE_S32;
   if (!is32BitInt)
      return;

   bld.setPosition(mod, false);

   Value *quotient = bld.getSSA();
   Value *product = bld.getSSA();

   bld.mkOp2(OP_DIV, mod->dType, quotient, mod->getSrc(0), mod->getSrc(1));
   handleDIV(quotient->getInsn());

   // handleDIV moved the builder, go back to just in front of mod
   bld.setPosition(mod, false);
   Instruction *mul =
      bld.mkOp2(OP_MUL, TYPE_U32, product, quotient, mod->getSrc(1));
   expandIntegerMUL(&bld, mul);

   mod->op = OP_SUB;
   mod->setSrc(1, product);
}
// Legalize every instruction of @bb, starting at the block's entry.
//
// EXPORTs are offered to propagateWriteToOutput() when outWrites is set;
// DIV, MOD and MUL/MAD go to their handlers. Afterwards any instruction
// whose def 0 is in FILE_ADDRESS is passed to handleAddrDef().
// Always returns true.
bool
NV50LegalizeSSA::visit(BasicBlock *bb)
{
   // skipping PHIs (don't pass them to handleAddrDef) !
   Instruction *insn = bb->getEntry();

   while (insn) {
      // the handlers may unlink or replace insn, so grab the successor first
      Instruction *const next = insn->next;

      if (insn->op == OP_EXPORT) {
         if (outWrites)
            propagateWriteToOutput(insn);
      } else if (insn->op == OP_DIV) {
         handleDIV(insn);
      } else if (insn->op == OP_MOD) {
         handleMOD(insn);
      } else if (insn->op == OP_MAD || insn->op == OP_MUL) {
         handleMUL(insn);
      }

      // NOTE(review): for OP_MUL, handleMUL may pass insn to
      // expandIntegerMUL, which calls delete_Instruction on it; the check
      // below then still reads insn — confirm this is safe.
      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
         handleAddrDef(insn);

      insn = next;
   }
   return true;
}
// Lowering pass run for CG_STAGE_PRE_SSA (see TargetNV50::runLegalizePass).
// visit(Instruction *) dispatches on the opcode to the handle*() methods.
class NV50LoweringPreSSA : public Pass
{
public:
NV50LoweringPreSSA(Program *);
private:
virtual bool visit(Instruction *);
virtual bool visit(Function *);
bool handleRDSV(Instruction *);
bool handleWRSV(Instruction *);
bool handleEXPORT(Instruction *);
bool handleDIV(Instruction *);
bool handleSQRT(Instruction *);
bool handlePOW(Instruction *);
bool handleSET(Instruction *);
bool handleSLCT(CmpInstruction *);
bool handleSELP(Instruction *);
bool handleTEX(TexInstruction *);
bool handleTXB(TexInstruction *); // I really
bool handleTXL(TexInstruction *); // hate
bool handleTXD(TexInstruction *); // these 3
bool handleCALL(Instruction *);
bool handlePRECONT(Instruction *);
bool handleCONT(Instruction *);
// replace a predicate that is not in FILE_FLAGS by a flags value
void checkPredicate(Instruction *);
private:
// target of the program given to the constructor
const Target *const targ;
BuildUtil bld;
// thread id value; NULL until visit(Function *) sets it for compute programs
Value *tid;
};
// Remember the program's target, start without a thread id value and bind
// the builder to @prog.
NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog)
   : targ(prog->getTarget()),
     tid(NULL)
{
   bld.setProgram(prog);
}
// Per-function setup. For compute programs a GPR value with register id 0 is
// appended to the function's inputs and copied, at the root block, into a
// scratch value that is remembered in tid. Always returns true.
bool
NV50LoweringPreSSA::visit(Function *f)
{
   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());

   if (prog->getType() != Program::TYPE_COMPUTE)
      return true;

   // Add implicit "thread id" argument in $r0 to the function
   Value *arg = new_LValue(func, FILE_GPR);
   arg->reg.data.id = 0;
   f->ins.push_back(arg);

   bld.setPosition(root, false);
   Instruction *copy = bld.mkMov(bld.getScratch(), arg, TYPE_U32);
   tid = copy->getDef(0);

   return true;
}
// Common source fix-ups for texture instructions:
//  - for shadow targets of TXB/TXL, swap the sources at positions
//    argCount and argCount + 1 (dref comes before bias/lod),
//  - for array targets, convert the layer source (at argCount - 1) from f32
//    to u32 and clamp it to at most 511,
//  - for cube arrays, retarget to TEX_TARGET_2D_ARRAY and rebuild sources
//    0..3 from copies of the former sources 0, 1, 3 and from source 4.
// Always returns true.
bool
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
{
   const int arg = i->tex.target.getArgCount();
   const int dref = arg;
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;

   // dref comes before bias/lod
   if (i->tex.target.isShadow() && (i->op == OP_TXB || i->op == OP_TXL))
      i->swapSources(dref, lod);

   // array index must be converted to u32
   if (i->tex.target.isArray()) {
      const int layerSlot = arg - 1;
      Value *layer = i->getSrc(layerSlot);
      LValue *src = new_LValue(func, FILE_GPR);

      bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
      bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
      i->setSrc(layerSlot, src);

      if (i->tex.target.isCube()) {
         // Value *face = layer;
         Value *x = new_LValue(func, FILE_GPR);
         Value *y = new_LValue(func, FILE_GPR);
         layer = new_LValue(func, FILE_GPR);

         i->tex.target = TEX_TARGET_2D_ARRAY;

         // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer
         bld.mkMov(x, i->getSrc(0));
         bld.mkMov(y, i->getSrc(1));
         bld.mkMov(layer, i->getSrc(3));

         i->setSrc(0, x);
         i->setSrc(1, y);
         i->setSrc(2, layer);
         i->setSrc(3, i->getSrc(4));
         i->setSrc(4, NULL);
      }
   }

   // texel offsets are 3 immediate fields in the instruction,
   // nv50 cannot do textureGatherOffsets
   assert(i->tex.useOffsets <= 1);

   return true;
}
// Bias must be equal for all threads of a quad or lod calculation will fail.
//
// The lanes of a quad are grouped by the bit in the condition register they
// have set, which is selected by differing bias values.
// Move the input values for TEX into a new register set for each group and
// execute TEX only for a specific group.
// We always need to use 4 new registers for the inputs/outputs because the
// implicitly calculated derivatives must be correct.
//
// TODO: move to SSA phase so we can easily determine whether bias is constant
//
// Returns true. If the bias is uniform only handleTEX() is applied; otherwise
// the instruction is replaced by four predicated clones and then deleted.
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
// one condition code per group, tested against the flags built below
const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
int l, d;
handleTEX(i);
// the bias is read from the source slot right after the target's arguments
Value *bias = i->getSrc(i->tex.target.getArgCount());
if (bias->isUniform())
return true;
// cond collects one value per lane: source 0 is the immediate 1, sources
// 1..3 are set in the loop below
Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
bld.loadImm(NULL, 1));
bld.setPosition(cond, false);
for (l = 1; l < 4; ++l) {
const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
Value *bit = bld.getSSA();
Value *pred = bld.getScratch(1, FILE_FLAGS);
Value *imm = bld.loadImm(NULL, (1 << l));
// quadop of bias against lane l's bias writes the flags in pred; the
// move of (1 << l) is predicated on CC_EQ of those flags
bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
cond->setSrc(l, bit);
}
// convert the merged value to U8 in a flags register, right after cond
Value *flags = bld.getScratch(1, FILE_FLAGS);
bld.setPosition(cond, true);
bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
// one clone of the texture instruction per group, predicated with cc[l]
Instruction *tex[4];
for (l = 0; l < 4; ++l) {
(tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
bld.insert(tex[l]);
}
// res[l][d]: value holding def d of group l
// (assumes the instruction has at most 4 defs — TODO confirm)
Value *res[4][4];
for (d = 0; i->defExists(d); ++d)
res[0][d] = tex[0]->getDef(d);
for (l = 1; l < 4; ++l) {
for (d = 0; tex[l]->defExists(d); ++d) {
res[l][d] = cloneShallow(func, res[0][d]);
bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
}
}
// merge the four per-group results into each original def
for (d = 0; i->defExists(d); ++d) {
Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
for (l = 0; l < 4; ++l)
dst->setSrc(l, res[l][d]);
}
delete_Instruction(prog, i);
return true;
}
// LOD must be equal for all threads of a quad.
// Unlike with TXB, here we can just diverge since there's no LOD calculation
// that would require all 4 threads' sources to be set up properly.
//
// Returns true. If the lod is uniform only handleTEX() is applied; otherwise
// the block is split around the instruction and one conditional branch per
// lane is emitted in front of it.
bool
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
{
handleTEX(i);
// the lod is read from the source slot right after the target's arguments
Value *lod = i->getSrc(i->tex.target.getArgCount());
if (lod->isUniform())
return true;
// currBB: code in front of the texture op, texiBB: block created by
// splitting before it, joinBB: block created by splitting after it
BasicBlock *currBB = i->bb;
BasicBlock *texiBB = i->bb->splitBefore(i, false);
BasicBlock *joinBB = i->bb->splitAfter(i);
bld.setPosition(currBB, true);
currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
for (int l = 0; l <= 3; ++l) {
const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
Value *pred = bld.getScratch(1, FILE_FLAGS);
bld.setPosition(currBB, true);
// quadop of lod against lane l's lod writes the flags in pred; the fixed
// branch to the texture block is taken on CC_EQ of those flags
bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
// lanes 0..2 continue in a fresh block holding the next lane's test
if (l <= 2) {
BasicBlock *laneBB = new BasicBlock(func);
currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
currBB = laneBB;
}
}
// emit the JOIN at position (joinBB, false)
bld.setPosition(joinBB, false);
bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
return true;
}
// Lower TXD (texture fetch with explicit derivatives) to four plain TEX
// instructions, one per lane, bracketed by QUADON/QUADPOP.
//
// For each lane l the coordinates of lane l are broadcast with a quadop, the
// dPdx and dPdy values are combined into them with the quadops from qOps[l],
// a clone of the instruction is executed on those coordinates, and its
// results are saved by fixed moves whose lanes mask is (1 << l). The saved
// values are finally merged per def with OP_UNION and the original
// instruction is removed from its block. Returns true.
bool
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
{
// qOps[l][0] is used with dPdx and qOps[l][1] with dPdy for lane l
static const uint8_t qOps[4][2] =
{
{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
{ QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
{ QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
};
// def[c][l]: saved def c of lane l
// (assumes at most 4 defs and a dimension of at most 3 — TODO confirm)
Value *def[4][4];
Value *crd[3];
Instruction *tex;
Value *zero = bld.loadImm(bld.getSSA(), 0);
int l, c;
const int dim = i->tex.target.getDim();
handleTEX(i);
i->op = OP_TEX; // no need to clone dPdx/dPdy later
for (c = 0; c < dim; ++c)
crd[c] = bld.getScratch();
bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
for (l = 0; l < 4; ++l) {
// mov coordinates from lane l to all lanes
for (c = 0; c < dim; ++c)
bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
// add dPdx from lane l to lanes dx
for (c = 0; c < dim; ++c)
bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
// add dPdy from lane l to lanes dy
for (c = 0; c < dim; ++c)
bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
// texture
bld.insert(tex = cloneForward(func, i));
for (c = 0; c < dim; ++c)
tex->setSrc(c, crd[c]);
// save results
for (c = 0; i->defExists(c); ++c) {
Instruction *mov;
def[c][l] = bld.getSSA();
mov = bld.mkMov(def[c][l], tex->getDef(c));
mov->fixed = 1;
mov->lanes = 1 << l;
}
}
bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
// merge the four per-lane copies into each original def
for (c = 0; i->defExists(c); ++c) {
Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
for (l = 0; l < 4; ++l)
u->setSrc(l, def[c][l]);
}
i->bb->remove(i);
return true;
}
// Lower a SET with an f32 destination: the SET itself is retyped to U32 and
// followed by ABS (as S32) and a S32 -> F32 conversion, both operating in
// place on the SET's def 0. Other destination types are left untouched.
// Always returns true.
bool
NV50LoweringPreSSA::handleSET(Instruction *i)
{
   if (i->dType != TYPE_F32)
      return true;

   bld.setPosition(i, true);
   i->dType = TYPE_U32;

   Value *dst = i->getDef(0);
   bld.mkOp1(OP_ABS, TYPE_S32, dst, dst);
   bld.mkCvt(OP_CVT, TYPE_F32, dst, TYPE_S32, dst);

   return true;
}
// Lower SLCT to a flag-setting SET followed by two predicated moves and a
// UNION.
//
// The original instruction is rewritten in place into SET(src2, 0) writing
// the flags value `pred`; after it, source 0 is moved under CC_NE, source 1
// under CC_EQ, and both are merged into the original def 0. Returns true.
bool
NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
{
Value *src0 = bld.getSSA();
Value *src1 = bld.getSSA();
Value *pred = bld.getScratch(1, FILE_FLAGS);
Value *v0 = i->getSrc(0);
Value *v1 = i->getSrc(1);
// XXX: these probably shouldn't be immediates in the first place ...
// immediates are first copied into values, at the builder's current
// position (set by the caller)
if (v0->asImm())
v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
if (v1->asImm())
v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
// the selection code goes after the instruction
bld.setPosition(i, true);
bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
// back in front of the instruction for the immediate load below
bld.setPosition(i, false);
// turn i into SET(former source 2, 0) with pred as flags def
i->op = OP_SET;
i->setFlagsDef(0, pred);
i->dType = TYPE_U8;
i->setSrc(0, i->getSrc(2));
i->setSrc(2, NULL);
i->setSrc(1, bld.loadImm(NULL, 0));
return true;
}
// Lower SELP to two predicated moves merged by a UNION.
//
// Source 0 is moved under CC_NE and source 1 under CC_EQ, both predicated on
// source 2; immediate sources are copied into values first. The UNION writes
// the original def 0 and the SELP is deleted. Returns true.
bool
NV50LoweringPreSSA::handleSELP(Instruction *i)
{
   Value *movedFirst = bld.getSSA();
   Value *movedSecond = bld.getSSA();

   Value *first = i->getSrc(0);
   Value *second = i->getSrc(1);

   if (first->asImm())
      first = bld.mkMov(bld.getSSA(), first)->getDef(0);
   if (second->asImm())
      second = bld.mkMov(bld.getSSA(), second)->getDef(0);

   Instruction *movFirst = bld.mkMov(movedFirst, first);
   movFirst->setPredicate(CC_NE, i->getSrc(2));

   Instruction *movSecond = bld.mkMov(movedSecond, second);
   movSecond->setPredicate(CC_EQ, i->getSrc(2));

   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), movedFirst, movedSecond);

   delete_Instruction(prog, i);
   return true;
}
// Lower a system value write to an EXPORT to the shader output address the
// target reports for the symbol, and unlink the original instruction.
// Returns false, without changing anything, if that address is >= 0x400.
bool
NV50LoweringPreSSA::handleWRSV(Instruction *i)
{
   Symbol *sv = i->getSrc(0)->asSym();

   // these are all shader outputs, $sreg are not writeable
   const uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sv);
   if (addr >= 0x400)
      return false;

   Symbol *out = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
   bld.mkStore(OP_EXPORT, i->dType, out, i->getIndirect(0, 0), i->getSrc(1));

   bld.getBB()->remove(i);
   return true;
}
// In compute programs, append the thread id value as an extra source of the
// call. Other program types are left untouched. Always returns true.
bool
NV50LoweringPreSSA::handleCALL(Instruction *i)
{
   if (prog->getType() != Program::TYPE_COMPUTE)
      return true;

   // Add implicit "thread id" argument in $r0 to the function
   i->setSrc(i->srcCount(), tid);
   return true;
}
// PRECONT is simply dropped: the instruction is deleted from the program.
// Always returns true.
bool
NV50LoweringPreSSA::handlePRECONT(Instruction *i)
{
   delete_Instruction(prog, i);

   return true;
}
// CONT is turned into a plain branch by rewriting its opcode in place.
// Always returns true.
bool
NV50LoweringPreSSA::handleCONT(Instruction *i)
{
   i->op = OP_BRA;

   return true;
}
// Lower a system value read.
//
// If the address the target reports for the symbol is >= 0x400 the
// instruction is kept as it is (mov $sreg). Otherwise replacement code
// writing the original def 0 is emitted according to the semantic, and the
// RDSV is unlinked from the builder's current block. Always returns true.
bool
NV50LoweringPreSSA::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   const uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
   Value *def = i->getDef(0);
   const SVSemantic sv = sym->reg.data.sv.sv;
   const int idx = sym->reg.data.sv.index;

   if (addr >= 0x400) // mov $sreg
      return true;

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, def, addr, NULL);
      break;

   case SV_FACE:
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
      if (i->dType == TYPE_F32) {
         // keep only bit 31, then xor with 0xbf800000
         bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
         bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
      }
      break;

   case SV_NCTAID:
   case SV_CTAID:
   case SV_NTID: {
      const bool readsAsOne = (sv == SV_NCTAID && idx >= 2) ||
                              (sv == SV_NTID && idx >= 3);
      const bool readsAsZero = (sv == SV_CTAID && idx >= 2);

      if (readsAsOne) {
         bld.mkMov(def, bld.mkImm(1));
      } else if (readsAsZero) {
         bld.mkMov(def, bld.mkImm(0));
      } else {
         // 16 bit load from FILE_MEMORY_SHARED, widened to u32
         Value *x = bld.getSSA(2);
         bld.mkOp1(OP_LOAD, TYPE_U16, x,
                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
      }
      break;
   }

   case SV_TID:
      // components are extracted from the packed tid value:
      // bits 0-15, bits 16-25 and bits 26 and up
      switch (idx) {
      case 0:
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
         break;
      case 1:
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
         break;
      case 2:
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
         break;
      default:
         bld.mkMov(def, bld.mkImm(0));
         break;
      }
      break;

   default:
      bld.mkFetch(def, i->dType,
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
      break;
   }

   bld.getBB()->remove(i);
   return true;
}
// Lower a floating point DIV to a multiplication by the reciprocal of
// source 1: an RCP is inserted before the instruction, which is then turned
// into a MUL in place. Non-float divisions are left untouched.
// Always returns true.
bool
NV50LoweringPreSSA::handleDIV(Instruction *i)
{
   if (isFloatType(i->dType)) {
      bld.setPosition(i, false);

      Instruction *recip =
         bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));

      i->op = OP_MUL;
      i->setSrc(1, recip->getDef(0));
   }
   return true;
}
// Lower SQRT(x) to x * RSQ(x): an f32 RSQ of source 0 is emitted at the
// builder's current position and the instruction becomes a MUL of its
// source 0 with that result. Always returns true.
bool
NV50LoweringPreSSA::handleSQRT(Instruction *i)
{
   Value *rsqDst = bld.getSSA();
   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, rsqDst, i->getSrc(0));

   i->op = OP_MUL;
   i->setSrc(1, rsq->getDef(0));

   return true;
}
// Lower POW(x, y) to EX2(PREEX2(y * LG2(x))).
//
// LG2, the multiply (with its dnz flag set) and PREEX2 are emitted into one
// scratch value; the original instruction becomes the EX2 reading it.
// Always returns true.
bool
NV50LoweringPreSSA::handlePOW(Instruction *i)
{
   LValue *tmp = bld.getScratch();

   bld.mkOp1(OP_LG2, TYPE_F32, tmp, i->getSrc(0));

   Instruction *scale = bld.mkOp2(OP_MUL, TYPE_F32, tmp, i->getSrc(1), tmp);
   scale->dnz = 1;

   bld.mkOp1(OP_PREEX2, TYPE_F32, tmp, tmp);

   i->op = OP_EX2;
   i->setSrc(0, tmp);
   i->setSrc(1, NULL);

   return true;
}
// Lower an EXPORT in a fragment program to a final MOV into a fixed GPR.
//
// The GPR id is the export symbol's offset divided by 4, and prog->maxGPR is
// raised to cover it. Indirectly addressed exports are not supported in
// fragment programs (returns false). Exports in other program types are
// left untouched. Returns true in all other cases.
bool
NV50LoweringPreSSA::handleEXPORT(Instruction *i)
{
   if (prog->getType() != Program::TYPE_FRAGMENT)
      return true;

   if (i->getIndirect(0, 0)) {
      // TODO: redirect to l[] here, load to GPRs at exit
      return false;
   }

   int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units

   i->op = OP_MOV;
   i->subOp = NV50_IR_SUBOP_MOV_FINAL;

   // the exported value moves from source 1 to source 0
   i->src(0).set(i->src(1));
   i->setSrc(1, NULL);

   i->setDef(0, new_LValue(func, FILE_GPR));
   i->getDef(0)->reg.data.id = id;

   prog->maxGPR = MAX2(prog->maxGPR, id);

   return true;
}
// Set flags according to predicate and make the instruction read $cX.
//
// Nothing happens if @insn has no predicate or the predicate already is in
// FILE_FLAGS. Otherwise a SET (CC_NEU, u32) comparing 0 with the predicate
// writes a new flags value, which replaces the predicate while the
// instruction's condition code is kept.
void
NV50LoweringPreSSA::checkPredicate(Instruction *insn)
{
   Value *pred = insn->getPredicate();

   if (!pred)
      return;
   if (pred->reg.file == FILE_FLAGS)
      return;

   Value *flags = bld.getSSA(1, FILE_FLAGS);
   bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, flags, bld.loadImm(NULL, 0), pred);

   insn->setPredicate(insn->cc, flags);
}
//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
// Positions the builder in front of @i, legalizes the predicate of
// conditional instructions and dispatches on the opcode. The return value is
// that of the selected handler; opcodes without a handler (and EX2, which is
// handled inline) yield true.
bool
NV50LoweringPreSSA::visit(Instruction *i)
{
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   // texturing
   case OP_TEX:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXB:
      return handleTXB(i->asTex());
   case OP_TXL:
      return handleTXL(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());

   // arithmetic
   case OP_EX2:
      // PREEX2 is applied to the source first; EX2 then reads its own def
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_SQRT:
      return handleSQRT(i);

   // comparison / selection
   case OP_SET:
      return handleSET(i);
   case OP_SLCT:
      return handleSLCT(i->asCmp());
   case OP_SELP:
      return handleSELP(i);

   // inputs / outputs
   case OP_EXPORT:
      return handleEXPORT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);

   // control flow
   case OP_CALL:
      return handleCALL(i);
   case OP_PRECONT:
      return handlePRECONT(i);
   case OP_CONT:
      return handleCONT(i);

   default:
      break;
   }
   return true;
}
// Run the nv50 legalization pass belonging to @stage on @prog.
//
//  - CG_STAGE_PRE_SSA: NV50LoweringPreSSA
//  - CG_STAGE_SSA:     NV50LegalizeSSA; prog->targetPriv is made to point to
//                      a std::list<Instruction *> (allocated here if unset)
//  - CG_STAGE_POST_RA: NV50LegalizePostRA; afterwards the list in
//                      prog->targetPriv is freed and the pointer cleared
//
// Returns the result of the pass's run(), or false for any other stage.
bool
TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
{
   typedef std::list<Instruction *> InsnList;

   bool ret = false;

   if (stage == CG_STAGE_PRE_SSA) {
      NV50LoweringPreSSA pass(prog);
      ret = pass.run(prog, false, true);
   } else if (stage == CG_STAGE_SSA) {
      if (!prog->targetPriv)
         prog->targetPriv = new InsnList();
      NV50LegalizeSSA pass(prog);
      ret = pass.run(prog, false, true);
   } else if (stage == CG_STAGE_POST_RA) {
      NV50LegalizePostRA pass;
      ret = pass.run(prog, false, true);
      // Free the list and clear the pointer: leaving it dangling would make
      // a later SSA-stage run reuse freed memory and a later post-RA run
      // free it a second time. (delete on a null pointer is a no-op.)
      delete reinterpret_cast<InsnList *>(prog->targetPriv);
      prog->targetPriv = NULL;
   }
   return ret;
}
} // namespace nv50_ir