// src/gallium/drivers/radeon/R600ISelLowering.cpp - platform/external/mesa3d - Git at Google

 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // Most of the DAG lowering is handled in AMDGPUISelLowering.cpp.  This file
 // is mostly EmitInstrWithCustomInserter().
 //
 //===----------------------------------------------------------------------===//

 #include "R600ISelLowering.h"
 #include "R600Defines.h"
 #include "R600InstrInfo.h"
 #include "R600MachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"

 using namespace llvm;

 // Sets up the R600 lowering configuration: legal register classes, the
 // operations that must be expanded, the operations that are routed through
 // LowerOperation(), and the scheduling preference.
 R600TargetLowering::R600TargetLowering(TargetMachine &TM)
   : AMDGPUTargetLowering(TM),
     TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo()))
 {
   // 64-bit integer multiplies are expanded.
   setOperationAction(ISD::MUL, MVT::i64, Expand);

   // f32/i32 values use the 32-bit register class, their 4-wide vector
   // counterparts use the 128-bit register class.
   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
   computeRegisterProperties();

   // f32 subtraction is expanded as well.
   setOperationAction(ISD::FSUB, MVT::f32, Expand);

   // (operation, type) pairs that get custom lowering.
   static const struct {
     unsigned Opcode;
     MVT::SimpleValueType Type;
   } CustomLowered[] = {
     { ISD::BR_CC,              MVT::i32   },
     { ISD::INTRINSIC_VOID,     MVT::Other },
     { ISD::INTRINSIC_WO_CHAIN, MVT::Other },
     { ISD::ROTL,               MVT::i32   },
     { ISD::SELECT_CC,          MVT::f32   },
     { ISD::SELECT_CC,          MVT::i32   },
     { ISD::SETCC,              MVT::i32   }
   };
   const unsigned NumCustomLowered =
       sizeof(CustomLowered) / sizeof(CustomLowered[0]);
   for (unsigned Idx = 0; Idx != NumCustomLowered; ++Idx) {
     setOperationAction(CustomLowered[Idx].Opcode, CustomLowered[Idx].Type,
                        Custom);
   }

   setSchedulingPreference(Sched::VLIW);
 }

 // Expands the R600 pseudo instructions that request custom insertion.
 //
 // For each opcode handled below, the replacement instructions are built
 // immediately before MI and MI is erased afterwards.  MASK_WRITE is the
 // exception: it only sets a flag on the instruction defining its operand and
 // returns without erasing MI.  Any other opcode is forwarded to
 // AMDGPUTargetLowering::EmitInstrWithCustomInserter().
 //
 // MI is the pseudo instruction to expand and BB the block that contains it.
 // Every case handled here returns BB.
 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const
 {
   MachineFunction * MF = BB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   MachineBasicBlock::iterator I = *MI;
   const unsigned Opcode = MI->getOpcode();
   // All replacement instructions share the debug location found at MI.
   const DebugLoc DL = BB->findDebugLoc(I);

   switch (Opcode) {
   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case AMDGPU::CLAMP_R600:
     {
       // MOV with the clamp flag set on operand 0.
       MachineInstr *NewMI =
         BuildMI(*BB, I, DL, TII->get(AMDGPU::MOV))
                .addOperand(MI->getOperand(0))
                .addOperand(MI->getOperand(1))
                .addImm(0) // Flags
                .addReg(AMDGPU::PRED_SEL_OFF);
       TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
       break;
     }

   case AMDGPU::FABS_R600:
   case AMDGPU::FNEG_R600:
     {
       // MOV with the abs (FABS) or neg (FNEG) flag set on operand 1.
       const unsigned ModifierFlag =
           (Opcode == AMDGPU::FABS_R600) ? MO_FLAG_ABS : MO_FLAG_NEG;
       MachineInstr *NewMI =
         BuildMI(*BB, I, DL, TII->get(AMDGPU::MOV))
                .addOperand(MI->getOperand(0))
                .addOperand(MI->getOperand(1))
                .addImm(0) // Flags
                .addReg(AMDGPU::PRED_SEL_OFF);
       TII->addFlag(NewMI, 1, ModifierFlag);
       break;
     }

   case AMDGPU::R600_LOAD_CONST:
     {
       // Operand 1 is the index of the constant register to copy from.
       int64_t RegIndex = MI->getOperand(1).getImm();
       unsigned ConstantReg = AMDGPU::R600_CReg32RegClass.getRegister(RegIndex);
       BuildMI(*BB, I, DL, TII->get(AMDGPU::COPY))
                   .addOperand(MI->getOperand(0))
                   .addReg(ConstantReg);
       break;
     }

   case AMDGPU::MASK_WRITE:
     {
       // Flag operand 0 of the instruction that defines the masked register.
       unsigned maskedRegister = MI->getOperand(0).getReg();
       assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
       MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
       TII->addFlag(defInstr, 0, MO_FLAG_MASK);
       // Return early so the instruction is not erased
       return BB;
     }

   case AMDGPU::RAT_WRITE_CACHELESS_eg:
     {
       // Convert to DWORD address
       unsigned NewAddr = MRI.createVirtualRegister(
                                              &AMDGPU::R600_TReg32_XRegClass);
       unsigned ShiftValue = MRI.createVirtualRegister(
                                               &AMDGPU::R600_TReg32RegClass);

       // The end-of-program bit is set when the write is directly followed by
       // RETURN.  Check for the end of the block first so that the end
       // iterator is never dereferenced.
       MachineBasicBlock::iterator NextI = llvm::next(I);
       unsigned EOP =
           (NextI != BB->end() && NextI->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

       // XXX In theory, we should be able to pass ShiftValue directly to
       // the LSHR_eg instruction as an inline literal, but I tried doing it
       // this way and it didn't produce the correct results.
       BuildMI(*BB, I, DL, TII->get(AMDGPU::MOV_IMM_I32), ShiftValue)
               .addReg(AMDGPU::ALU_LITERAL_X)
               .addReg(AMDGPU::PRED_SEL_OFF)
               .addImm(2);
       BuildMI(*BB, I, DL, TII->get(AMDGPU::LSHR_eg), NewAddr)
               .addOperand(MI->getOperand(1))
               .addReg(ShiftValue)
               .addReg(AMDGPU::PRED_SEL_OFF);
       BuildMI(*BB, I, DL, TII->get(Opcode))
               .addOperand(MI->getOperand(0))
               .addReg(NewAddr)
               .addImm(EOP); // Set End of program bit
       break;
     }

   case AMDGPU::RESERVE_REG:
     {
       // Record the register selected by operand 0 in the function info.
       R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>();
       int64_t ReservedIndex = MI->getOperand(0).getImm();
       unsigned ReservedReg =
                           AMDGPU::R600_TReg32RegClass.getRegister(ReservedIndex);
       MFI->ReservedRegs.push_back(ReservedReg);
       break;
     }

   case AMDGPU::TXD:
   case AMDGPU::TXD_SHADOW:
     {
       // Both variants emit the two gradient setters followed by a sample;
       // only the sample opcode differs (TXD_SHADOW uses TEX_SAMPLE_C_G).
       const unsigned SampleOpcode = (Opcode == AMDGPU::TXD)
                                         ? AMDGPU::TEX_SAMPLE_G
                                         : AMDGPU::TEX_SAMPLE_C_G;
       unsigned t0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
       unsigned t1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

       BuildMI(*BB, I, DL, TII->get(AMDGPU::TEX_SET_GRADIENTS_H), t0)
               .addOperand(MI->getOperand(3))
               .addOperand(MI->getOperand(4))
               .addOperand(MI->getOperand(5));
       BuildMI(*BB, I, DL, TII->get(AMDGPU::TEX_SET_GRADIENTS_V), t1)
               .addOperand(MI->getOperand(2))
               .addOperand(MI->getOperand(4))
               .addOperand(MI->getOperand(5));
       // The gradient results are attached to the sample as implicit uses.
       BuildMI(*BB, I, DL, TII->get(SampleOpcode))
               .addOperand(MI->getOperand(0))
               .addOperand(MI->getOperand(1))
               .addOperand(MI->getOperand(4))
               .addOperand(MI->getOperand(5))
               .addReg(t0, RegState::Implicit)
               .addReg(t1, RegState::Implicit);
       break;
     }

   case AMDGPU::BRANCH:
       // Unconditional jump: no predicate register.
       BuildMI(*BB, I, DL, TII->get(AMDGPU::JUMP))
               .addOperand(MI->getOperand(0))
               .addReg(0);
       break;

   case AMDGPU::BRANCH_COND_f32:
   case AMDGPU::BRANCH_COND_i32:
     {
       // PRED_X defines PREDICATE_BIT from operand 1, then JUMP consumes it.
       // The f32 and i32 variants differ only in the opcode immediate.
       const int64_t IsZeroOpcode = (Opcode == AMDGPU::BRANCH_COND_f32)
                                        ? OPCODE_IS_ZERO
                                        : OPCODE_IS_ZERO_INT;
       MachineInstr *NewMI =
         BuildMI(*BB, I, DL, TII->get(AMDGPU::PRED_X))
                 .addReg(AMDGPU::PREDICATE_BIT)
                 .addOperand(MI->getOperand(1))
                 .addImm(IsZeroOpcode)
                 .addImm(0); // Flags
       TII->addFlag(NewMI, 1, MO_FLAG_PUSH);
       BuildMI(*BB, I, DL, TII->get(AMDGPU::JUMP))
               .addOperand(MI->getOperand(0))
               .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
       break;
     }
   }

   MI->eraseFromParent();
   return BB;
 }

 //===----------------------------------------------------------------------===//
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//

 using namespace llvm::Intrinsic;
 using namespace llvm::AMDGPUIntrinsic;

 // Dispatches custom lowering for the node Op.
 //
 // BR_CC, ROTL, SELECT_CC and SETCC go to their dedicated helpers.  For
 // INTRINSIC_VOID only AMDGPU_store_output is handled; every other void
 // intrinsic yields an empty SDValue.  For INTRINSIC_WO_CHAIN the input,
 // implicit-parameter and thread/group id intrinsics are handled here.
 // Everything else is forwarded to AMDGPUTargetLowering::LowerOperation().
 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
 {
   const unsigned Opcode = Op.getOpcode();

   if (Opcode == ISD::BR_CC) {
     return LowerBR_CC(Op, DAG);
   }
   if (Opcode == ISD::ROTL) {
     return LowerROTL(Op, DAG);
   }
   if (Opcode == ISD::SELECT_CC) {
     return LowerSELECT_CC(Op, DAG);
   }
   if (Opcode == ISD::SETCC) {
     return LowerSETCC(Op, DAG);
   }

   if (Opcode == ISD::INTRINSIC_VOID) {
     // Operand 0 is the chain, operand 1 the intrinsic id.
     const unsigned VoidID =
         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
     if (VoidID != AMDGPUIntrinsic::AMDGPU_store_output) {
       return SDValue();
     }

     // Operand 3 selects the output register, operand 2 is the stored value.
     MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
     int64_t OutIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
     unsigned OutReg = AMDGPU::R600_TReg32RegClass.getRegister(OutIndex);
     if (!RegInfo.isLiveOut(OutReg)) {
       RegInfo.addLiveOut(OutReg);
     }
     return DAG.getCopyToReg(Op.getOperand(0), Op.getDebugLoc(), OutReg,
                             Op.getOperand(2));
   }

   if (Opcode != ISD::INTRINSIC_WO_CHAIN) {
     return AMDGPUTargetLowering::LowerOperation(Op, DAG);
   }

   // Chain-less intrinsics: operand 0 is the intrinsic id.
   const unsigned IntrinsicID =
       cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   const EVT ResultVT = Op.getValueType();
   const DebugLoc Loc = Op.getDebugLoc();

   // The switch either returns directly or picks one of:
   //  - ParamDword: dword offset of an implicit parameter (>= 0 when set)
   //  - LiveInReg:  physical register that carries the requested id
   int ParamDword = -1;
   unsigned LiveInReg = 0;

   switch (IntrinsicID) {
   default:
     return AMDGPUTargetLowering::LowerOperation(Op, DAG);

   case AMDGPUIntrinsic::R600_load_input: {
     int64_t InIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
     unsigned InReg = AMDGPU::R600_TReg32RegClass.getRegister(InIndex);
     return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, InReg,
                                 ResultVT);
   }

   case r600_read_ngroups_x:     ParamDword = 0; break;
   case r600_read_ngroups_y:     ParamDword = 1; break;
   case r600_read_ngroups_z:     ParamDword = 2; break;
   case r600_read_global_size_x: ParamDword = 3; break;
   case r600_read_global_size_y: ParamDword = 4; break;
   case r600_read_global_size_z: ParamDword = 5; break;
   case r600_read_local_size_x:  ParamDword = 6; break;
   case r600_read_local_size_y:  ParamDword = 7; break;
   case r600_read_local_size_z:  ParamDword = 8; break;

   case r600_read_tgid_x:  LiveInReg = AMDGPU::T1_X; break;
   case r600_read_tgid_y:  LiveInReg = AMDGPU::T1_Y; break;
   case r600_read_tgid_z:  LiveInReg = AMDGPU::T1_Z; break;
   case r600_read_tidig_x: LiveInReg = AMDGPU::T0_X; break;
   case r600_read_tidig_y: LiveInReg = AMDGPU::T0_Y; break;
   case r600_read_tidig_z: LiveInReg = AMDGPU::T0_Z; break;
   }

   if (ParamDword >= 0) {
     return LowerImplicitParameter(DAG, ResultVT, Loc, ParamDword);
   }
   return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, LiveInReg,
                               ResultVT);
 }

 // Lowers BR_CC into an i32 SELECT_CC (-1 when the comparison holds, 0
 // otherwise) feeding an AMDGPUISD::BRANCH_COND node.
 SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const
 {
   // BR_CC operands: chain, condition code, lhs, rhs, jump target.
   SDValue InChain = Op.getOperand(0);
   SDValue CondCode = Op.getOperand(1);
   SDValue Target = Op.getOperand(4);

   SDValue Flag = DAG.getNode(ISD::SELECT_CC, Op.getDebugLoc(), MVT::i32,
                              Op.getOperand(2), Op.getOperand(3),
                              DAG.getConstant(-1, MVT::i32),
                              DAG.getConstant(0, MVT::i32),
                              CondCode);

   return DAG.getNode(AMDGPUISD::BRANCH_COND, Flag.getDebugLoc(), MVT::Other,
                      InChain, Target, Flag);
 }

 // Builds a load of type VT for the implicit parameter located DwordOffset
 // dwords into the PARAM_I address space.  The load is chained to the entry
 // node and uses the byte offset as its address.
 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                    DebugLoc DL,
                                                    unsigned DwordOffset) const
 {
   const unsigned OffsetInBytes = 4 * DwordOffset;
   PointerType * ParamPtrTy =
       PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                        AMDGPUAS::PARAM_I_ADDRESS);

   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
   assert(isInt<16>(OffsetInBytes));

   SDValue Address = DAG.getConstant(OffsetInBytes, MVT::i32);
   MachinePointerInfo PtrInfo(ConstantPointerNull::get(ParamPtrTy));

   return DAG.getLoad(VT, DL, DAG.getEntryNode(), Address, PtrInfo,
                      false, false, false, 0);
 }

 // Lowers ROTL(x, n) to AMDGPUISD::BITALIGN(x, x, 32 - n).
 // NOTE(review): presumably BITALIGN shifts the 64-bit pair right by its third
 // operand, which would make this a left rotate -- confirm against the
 // BITALIGN definition.
 SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const
 {
   const EVT Ty = Op.getValueType();
   const DebugLoc Loc = Op.getDebugLoc();
   SDValue Value = Op.getOperand(0);

   SDValue Amount = DAG.getNode(ISD::SUB, Loc, Ty,
                                DAG.getConstant(32, MVT::i32),
                                Op.getOperand(1));

   return DAG.getNode(AMDGPUISD::BITALIGN, Loc, Ty, Value, Value, Amount);
 }

 // Custom lowering for SELECT_CC.
 //
 // Operands of Op: LHS and RHS (the compared values), True and False (the
 // selected values) and the condition code.  The compared values are first
 // converted to the result type when the two types differ.  If True/False are
 // already the hardware true/false constants, a SELECT_CC on the (possibly
 // converted) operands is returned directly.  Otherwise the node is split
 // into a SELECT_CC that produces the hardware true/false constant and a
 // SELECT on that result.
 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
 {
   DebugLoc DL = Op.getDebugLoc();
   EVT VT = Op.getValueType();

   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDValue True = Op.getOperand(2);
   SDValue False = Op.getOperand(3);
   SDValue CC = Op.getOperand(4);
   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   // Scratch value; only referenced by the disabled (#if 0) code below.
   SDValue Temp;

   // LHS and RHS are guaranteed to be the same value type
   EVT CompareVT = LHS.getValueType();

   // We need all the operands of SELECT_CC to have the same value type, so if
   // necessary we need to convert LHS and RHS to be the same type True and
   // False.  True and False are guaranteed to have the same type as this
   // SELECT_CC node.

   if (CompareVT !=  VT) {
     // Stays DELETED_NODE only for type pairings that are not expected (the
     // assert below fires in that case).
     ISD::NodeType ConversionOp = ISD::DELETED_NODE;
     if (VT == MVT::f32 && CompareVT == MVT::i32) {
       // Integer compare, float result: pick the conversion that matches the
       // signedness of the comparison.
       if (isUnsignedIntSetCC(CCOpcode)) {
         ConversionOp = ISD::UINT_TO_FP;
       } else {
         ConversionOp = ISD::SINT_TO_FP;
       }
     } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
       // Float compare, integer result.
       ConversionOp = ISD::FP_TO_SINT;
     } else {
       // I don't think there will be any other type pairings.
       assert(!"Unhandled operand type parings in SELECT_CC");
     }
     // XXX Check the value of LHS and RHS and avoid creating sequences like
     // (FTOI (ITOF))
     LHS = DAG.getNode(ConversionOp, DL, VT, LHS);
     RHS = DAG.getNode(ConversionOp, DL, VT, RHS);
   }

   // If True is a hardware TRUE value and False is a hardware FALSE value we
   // can handle this with a native instruction (SET* instructions).
   if ((isHWTrueValue(True) && isHWFalseValue(False))) {
     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
   }

   // XXX If True is a hardware FALSE value and False is a hardware TRUE value,
   // we can handle this with a native instruction, but we need to swap true
   // and false and change the conditional.  Not implemented yet: the branch
   // below is an empty placeholder.
   if (isHWTrueValue(False) && isHWFalseValue(True)) {
   }

   // XXX Check if we can lower this to a SELECT or if it is supported by a native
   // operation. (The code below does this but we don't have the Instruction
   // selection patterns to do this yet.)
 #if 0
   if (isZero(LHS) || isZero(RHS)) {
     SDValue Cond = (isZero(LHS) ? RHS : LHS);
     bool SwapTF = false;
     switch (CCOpcode) {
     case ISD::SETOEQ:
     case ISD::SETUEQ:
     case ISD::SETEQ:
       SwapTF = true;
       // Fall through
     case ISD::SETONE:
     case ISD::SETUNE:
     case ISD::SETNE:
       // We can lower to select
       if (SwapTF) {
         Temp = True;
         True = False;
         False = Temp;
       }
       // CNDE
       return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
     default:
       // Supported by a native operation (CNDGE, CNDGT)
       return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
     }
   }
 #endif

   // If we make it this far it means we have no native instructions to handle
   // this SELECT_CC, so we must lower it.
   SDValue HWTrue, HWFalse;

   // Hardware true/false constants: 1.0f/0.0f for f32, -1/0 for i32.
   if (VT == MVT::f32) {
     HWTrue = DAG.getConstantFP(1.0f, VT);
     HWFalse = DAG.getConstantFP(0.0f, VT);
   } else if (VT == MVT::i32) {
     HWTrue = DAG.getConstant(-1, VT);
     HWFalse = DAG.getConstant(0, VT);
   }
   else {
     assert(!"Unhandled value type in LowerSELECT_CC");
   }

   // Lower this unsupported SELECT_CC into a supported SELECT_CC (producing
   // the hardware true/false constant) followed by a SELECT.
   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, HWTrue, HWFalse, CC);

   // Convert the floating point condition (1.0f / 0.0f) to an i32 by negating
   // it and converting the result to a signed integer.
   if (VT == MVT::f32) {
     Cond = DAG.getNode(ISD::FP_TO_SINT, DL, MVT::i32,
                        DAG.getNode(ISD::FNEG, DL, VT, Cond));
   }

   return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
 }

 // Lowers an i32 SETCC to (1 & SELECT_CC(lhs, rhs, -1, 0, cc)), i.e. the
 // all-ones/zero select result is masked down to 1/0.
 SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const
 {
   // SETCC operands: lhs, rhs, condition code.
   SDValue Lhs = Op.getOperand(0);
   SDValue Rhs = Op.getOperand(1);
   SDValue CondCode = Op.getOperand(2);
   const DebugLoc Loc = Op.getDebugLoc();
   assert(Op.getValueType() == MVT::i32);

   SDValue Mask = DAG.getNode(ISD::SELECT_CC, Loc, MVT::i32,
                              Lhs, Rhs,
                              DAG.getConstant(-1, MVT::i32),
                              DAG.getConstant(0, MVT::i32),
                              CondCode);

   return DAG.getNode(ISD::AND, Loc, MVT::i32,
                      DAG.getConstant(1, MVT::i32), Mask);
 }
	//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Most of the DAG lowering is handled in AMDGPUISelLowering.cpp. This file
	// is mostly EmitInstrWithCustomInserter().
	//
	//===----------------------------------------------------------------------===//

	#include "R600ISelLowering.h"
	#include "R600Defines.h"
	#include "R600InstrInfo.h"
	#include "R600MachineFunctionInfo.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/SelectionDAG.h"

	using namespace llvm;

	// Sets up the R600 lowering configuration: legal register classes, the
	// operations that must be expanded, the operations that are routed through
	// LowerOperation(), and the scheduling preference.
	R600TargetLowering::R600TargetLowering(TargetMachine &TM)
	  : AMDGPUTargetLowering(TM),
	    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo()))
	{
	  // 64-bit integer multiplies are expanded.
	  setOperationAction(ISD::MUL, MVT::i64, Expand);

	  // f32/i32 values use the 32-bit register class, their 4-wide vector
	  // counterparts use the 128-bit register class.
	  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
	  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
	  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
	  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
	  computeRegisterProperties();

	  // f32 subtraction is expanded as well.
	  setOperationAction(ISD::FSUB, MVT::f32, Expand);

	  // (operation, type) pairs that get custom lowering.
	  static const struct {
	    unsigned Opcode;
	    MVT::SimpleValueType Type;
	  } CustomLowered[] = {
	    { ISD::BR_CC,              MVT::i32   },
	    { ISD::INTRINSIC_VOID,     MVT::Other },
	    { ISD::INTRINSIC_WO_CHAIN, MVT::Other },
	    { ISD::ROTL,               MVT::i32   },
	    { ISD::SELECT_CC,          MVT::f32   },
	    { ISD::SELECT_CC,          MVT::i32   },
	    { ISD::SETCC,              MVT::i32   }
	  };
	  const unsigned NumCustomLowered =
	      sizeof(CustomLowered) / sizeof(CustomLowered[0]);
	  for (unsigned Idx = 0; Idx != NumCustomLowered; ++Idx) {
	    setOperationAction(CustomLowered[Idx].Opcode, CustomLowered[Idx].Type,
	                       Custom);
	  }

	  setSchedulingPreference(Sched::VLIW);
	}

	// Expands the R600 pseudo instructions that request custom insertion.
	//
	// For each opcode handled below, the replacement instructions are built
	// immediately before MI and MI is erased afterwards.  MASK_WRITE is the
	// exception: it only sets a flag on the instruction defining its operand and
	// returns without erasing MI.  Any other opcode is forwarded to
	// AMDGPUTargetLowering::EmitInstrWithCustomInserter().
	//
	// MI is the pseudo instruction to expand and BB the block that contains it.
	// Every case handled here returns BB.
	MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
	    MachineInstr * MI, MachineBasicBlock * BB) const
	{
	  MachineFunction * MF = BB->getParent();
	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  MachineBasicBlock::iterator I = *MI;
	  const unsigned Opcode = MI->getOpcode();
	  // All replacement instructions share the debug location found at MI.
	  const DebugLoc DL = BB->findDebugLoc(I);

	  switch (Opcode) {
	  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
	  case AMDGPU::CLAMP_R600:
	    {
	      // MOV with the clamp flag set on operand 0.
	      MachineInstr *NewMI =
	        BuildMI(*BB, I, DL, TII->get(AMDGPU::MOV))
	               .addOperand(MI->getOperand(0))
	               .addOperand(MI->getOperand(1))
	               .addImm(0) // Flags
	               .addReg(AMDGPU::PRED_SEL_OFF);
	      TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
	      break;
	    }

	  case AMDGPU::FABS_R600:
	  case AMDGPU::FNEG_R600:
	    {
	      // MOV with the abs (FABS) or neg (FNEG) flag set on operand 1.
	      const unsigned ModifierFlag =
	          (Opcode == AMDGPU::FABS_R600) ? MO_FLAG_ABS : MO_FLAG_NEG;
	      MachineInstr *NewMI =
	        BuildMI(*BB, I, DL, TII->get(AMDGPU::MOV))
	               .addOperand(MI->getOperand(0))
	               .addOperand(MI->getOperand(1))
	               .addImm(0) // Flags
	               .addReg(AMDGPU::PRED_SEL_OFF);
	      TII->addFlag(NewMI, 1, ModifierFlag);
	      break;
	    }

	  case AMDGPU::R600_LOAD_CONST:
	    {
	      // Operand 1 is the index of the constant register to copy from.
	      int64_t RegIndex = MI->getOperand(1).getImm();
	      unsigned ConstantReg = AMDGPU::R600_CReg32RegClass.getRegister(RegIndex);
	      BuildMI(*BB, I, DL, TII->get(AMDGPU::COPY))
	                  .addOperand(MI->getOperand(0))
	                  .addReg(ConstantReg);
	      break;
	    }

	  case AMDGPU::MASK_WRITE:
	    {
	      // Flag operand 0 of the instruction that defines the masked register.
	      unsigned maskedRegister = MI->getOperand(0).getReg();
	      assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
	      MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
	      TII->addFlag(defInstr, 0, MO_FLAG_MASK);
	      // Return early so the instruction is not erased
	      return BB;
	    }

	  case AMDGPU::RAT_WRITE_CACHELESS_eg:
	    {
	      // Convert to DWORD address
	      unsigned NewAddr = MRI.createVirtualRegister(
	                                             &AMDGPU::R600_TReg32_XRegClass);
	      unsigned ShiftValue = MRI.createVirtualRegister(
	                                              &AMDGPU::R600_TReg32RegClass);

	      // The end-of-program bit is set when the write is directly followed by
	      // RETURN.  Check for the end of the block first so that the end
	      // iterator is never dereferenced.
	      MachineBasicBlock::iterator NextI = llvm::next(I);
	      unsigned EOP =
	          (NextI != BB->end() && NextI->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

	      // XXX In theory, we should be able to pass ShiftValue directly to
	      // the LSHR_eg instruction as an inline literal, but I tried doing it
	      // this way and it didn't produce the correct results.
	      BuildMI(*BB, I, DL, TII->get(AMDGPU::MOV_IMM_I32), ShiftValue)
	              .addReg(AMDGPU::ALU_LITERAL_X)
	              .addReg(AMDGPU::PRED_SEL_OFF)
	              .addImm(2);
	      BuildMI(*BB, I, DL, TII->get(AMDGPU::LSHR_eg), NewAddr)
	              .addOperand(MI->getOperand(1))
	              .addReg(ShiftValue)
	              .addReg(AMDGPU::PRED_SEL_OFF);
	      BuildMI(*BB, I, DL, TII->get(Opcode))
	              .addOperand(MI->getOperand(0))
	              .addReg(NewAddr)
	              .addImm(EOP); // Set End of program bit
	      break;
	    }

	  case AMDGPU::RESERVE_REG:
	    {
	      // Record the register selected by operand 0 in the function info.
	      R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>();
	      int64_t ReservedIndex = MI->getOperand(0).getImm();
	      unsigned ReservedReg =
	                          AMDGPU::R600_TReg32RegClass.getRegister(ReservedIndex);
	      MFI->ReservedRegs.push_back(ReservedReg);
	      break;
	    }

	  case AMDGPU::TXD:
	  case AMDGPU::TXD_SHADOW:
	    {
	      // Both variants emit the two gradient setters followed by a sample;
	      // only the sample opcode differs (TXD_SHADOW uses TEX_SAMPLE_C_G).
	      const unsigned SampleOpcode = (Opcode == AMDGPU::TXD)
	                                        ? AMDGPU::TEX_SAMPLE_G
	                                        : AMDGPU::TEX_SAMPLE_C_G;
	      unsigned t0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
	      unsigned t1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

	      BuildMI(*BB, I, DL, TII->get(AMDGPU::TEX_SET_GRADIENTS_H), t0)
	              .addOperand(MI->getOperand(3))
	              .addOperand(MI->getOperand(4))
	              .addOperand(MI->getOperand(5));
	      BuildMI(*BB, I, DL, TII->get(AMDGPU::TEX_SET_GRADIENTS_V), t1)
	              .addOperand(MI->getOperand(2))
	              .addOperand(MI->getOperand(4))
	              .addOperand(MI->getOperand(5));
	      // The gradient results are attached to the sample as implicit uses.
	      BuildMI(*BB, I, DL, TII->get(SampleOpcode))
	              .addOperand(MI->getOperand(0))
	              .addOperand(MI->getOperand(1))
	              .addOperand(MI->getOperand(4))
	              .addOperand(MI->getOperand(5))
	              .addReg(t0, RegState::Implicit)
	              .addReg(t1, RegState::Implicit);
	      break;
	    }

	  case AMDGPU::BRANCH:
	      // Unconditional jump: no predicate register.
	      BuildMI(*BB, I, DL, TII->get(AMDGPU::JUMP))
	              .addOperand(MI->getOperand(0))
	              .addReg(0);
	      break;

	  case AMDGPU::BRANCH_COND_f32:
	  case AMDGPU::BRANCH_COND_i32:
	    {
	      // PRED_X defines PREDICATE_BIT from operand 1, then JUMP consumes it.
	      // The f32 and i32 variants differ only in the opcode immediate.
	      const int64_t IsZeroOpcode = (Opcode == AMDGPU::BRANCH_COND_f32)
	                                       ? OPCODE_IS_ZERO
	                                       : OPCODE_IS_ZERO_INT;
	      MachineInstr *NewMI =
	        BuildMI(*BB, I, DL, TII->get(AMDGPU::PRED_X))
	                .addReg(AMDGPU::PREDICATE_BIT)
	                .addOperand(MI->getOperand(1))
	                .addImm(IsZeroOpcode)
	                .addImm(0); // Flags
	      TII->addFlag(NewMI, 1, MO_FLAG_PUSH);
	      BuildMI(*BB, I, DL, TII->get(AMDGPU::JUMP))
	              .addOperand(MI->getOperand(0))
	              .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
	      break;
	    }
	  }

	  MI->eraseFromParent();
	  return BB;
	}

	//===----------------------------------------------------------------------===//
	// Custom DAG Lowering Operations
	//===----------------------------------------------------------------------===//

	using namespace llvm::Intrinsic;
	using namespace llvm::AMDGPUIntrinsic;

	// Dispatches custom lowering for the node Op.
	//
	// BR_CC, ROTL, SELECT_CC and SETCC go to their dedicated helpers.  For
	// INTRINSIC_VOID only AMDGPU_store_output is handled; every other void
	// intrinsic yields an empty SDValue.  For INTRINSIC_WO_CHAIN the input,
	// implicit-parameter and thread/group id intrinsics are handled here.
	// Everything else is forwarded to AMDGPUTargetLowering::LowerOperation().
	SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
	{
	  const unsigned Opcode = Op.getOpcode();

	  if (Opcode == ISD::BR_CC) {
	    return LowerBR_CC(Op, DAG);
	  }
	  if (Opcode == ISD::ROTL) {
	    return LowerROTL(Op, DAG);
	  }
	  if (Opcode == ISD::SELECT_CC) {
	    return LowerSELECT_CC(Op, DAG);
	  }
	  if (Opcode == ISD::SETCC) {
	    return LowerSETCC(Op, DAG);
	  }

	  if (Opcode == ISD::INTRINSIC_VOID) {
	    // Operand 0 is the chain, operand 1 the intrinsic id.
	    const unsigned VoidID =
	        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
	    if (VoidID != AMDGPUIntrinsic::AMDGPU_store_output) {
	      return SDValue();
	    }

	    // Operand 3 selects the output register, operand 2 is the stored value.
	    MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
	    int64_t OutIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	    unsigned OutReg = AMDGPU::R600_TReg32RegClass.getRegister(OutIndex);
	    if (!RegInfo.isLiveOut(OutReg)) {
	      RegInfo.addLiveOut(OutReg);
	    }
	    return DAG.getCopyToReg(Op.getOperand(0), Op.getDebugLoc(), OutReg,
	                            Op.getOperand(2));
	  }

	  if (Opcode != ISD::INTRINSIC_WO_CHAIN) {
	    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
	  }

	  // Chain-less intrinsics: operand 0 is the intrinsic id.
	  const unsigned IntrinsicID =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  const EVT ResultVT = Op.getValueType();
	  const DebugLoc Loc = Op.getDebugLoc();

	  // The switch either returns directly or picks one of:
	  //  - ParamDword: dword offset of an implicit parameter (>= 0 when set)
	  //  - LiveInReg:  physical register that carries the requested id
	  int ParamDword = -1;
	  unsigned LiveInReg = 0;

	  switch (IntrinsicID) {
	  default:
	    return AMDGPUTargetLowering::LowerOperation(Op, DAG);

	  case AMDGPUIntrinsic::R600_load_input: {
	    int64_t InIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
	    unsigned InReg = AMDGPU::R600_TReg32RegClass.getRegister(InIndex);
	    return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, InReg,
	                                ResultVT);
	  }

	  case r600_read_ngroups_x:     ParamDword = 0; break;
	  case r600_read_ngroups_y:     ParamDword = 1; break;
	  case r600_read_ngroups_z:     ParamDword = 2; break;
	  case r600_read_global_size_x: ParamDword = 3; break;
	  case r600_read_global_size_y: ParamDword = 4; break;
	  case r600_read_global_size_z: ParamDword = 5; break;
	  case r600_read_local_size_x:  ParamDword = 6; break;
	  case r600_read_local_size_y:  ParamDword = 7; break;
	  case r600_read_local_size_z:  ParamDword = 8; break;

	  case r600_read_tgid_x:  LiveInReg = AMDGPU::T1_X; break;
	  case r600_read_tgid_y:  LiveInReg = AMDGPU::T1_Y; break;
	  case r600_read_tgid_z:  LiveInReg = AMDGPU::T1_Z; break;
	  case r600_read_tidig_x: LiveInReg = AMDGPU::T0_X; break;
	  case r600_read_tidig_y: LiveInReg = AMDGPU::T0_Y; break;
	  case r600_read_tidig_z: LiveInReg = AMDGPU::T0_Z; break;
	  }

	  if (ParamDword >= 0) {
	    return LowerImplicitParameter(DAG, ResultVT, Loc, ParamDword);
	  }
	  return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, LiveInReg,
	                              ResultVT);
	}

	// Lowers BR_CC into an i32 SELECT_CC (-1 when the comparison holds, 0
	// otherwise) feeding an AMDGPUISD::BRANCH_COND node.
	SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const
	{
	  // BR_CC operands: chain, condition code, lhs, rhs, jump target.
	  SDValue InChain = Op.getOperand(0);
	  SDValue CondCode = Op.getOperand(1);
	  SDValue Target = Op.getOperand(4);

	  SDValue Flag = DAG.getNode(ISD::SELECT_CC, Op.getDebugLoc(), MVT::i32,
	                             Op.getOperand(2), Op.getOperand(3),
	                             DAG.getConstant(-1, MVT::i32),
	                             DAG.getConstant(0, MVT::i32),
	                             CondCode);

	  return DAG.getNode(AMDGPUISD::BRANCH_COND, Flag.getDebugLoc(), MVT::Other,
	                     InChain, Target, Flag);
	}

	// Builds a load of type VT for the implicit parameter located DwordOffset
	// dwords into the PARAM_I address space.  The load is chained to the entry
	// node and uses the byte offset as its address.
	SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
	                                                   DebugLoc DL,
	                                                   unsigned DwordOffset) const
	{
	  const unsigned OffsetInBytes = 4 * DwordOffset;
	  PointerType * ParamPtrTy =
	      PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
	                       AMDGPUAS::PARAM_I_ADDRESS);

	  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
	  assert(isInt<16>(OffsetInBytes));

	  SDValue Address = DAG.getConstant(OffsetInBytes, MVT::i32);
	  MachinePointerInfo PtrInfo(ConstantPointerNull::get(ParamPtrTy));

	  return DAG.getLoad(VT, DL, DAG.getEntryNode(), Address, PtrInfo,
	                     false, false, false, 0);
	}

	// Lowers ROTL(x, n) to AMDGPUISD::BITALIGN(x, x, 32 - n).
	// NOTE(review): presumably BITALIGN shifts the 64-bit pair right by its third
	// operand, which would make this a left rotate -- confirm against the
	// BITALIGN definition.
	SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const
	{
	  const EVT Ty = Op.getValueType();
	  const DebugLoc Loc = Op.getDebugLoc();
	  SDValue Value = Op.getOperand(0);

	  SDValue Amount = DAG.getNode(ISD::SUB, Loc, Ty,
	                               DAG.getConstant(32, MVT::i32),
	                               Op.getOperand(1));

	  return DAG.getNode(AMDGPUISD::BITALIGN, Loc, Ty, Value, Value, Amount);
	}

	// Custom lowering for SELECT_CC.
	//
	// Operands of Op: LHS and RHS (the compared values), True and False (the
	// selected values) and the condition code.  The compared values are first
	// converted to the result type when the two types differ.  If True/False are
	// already the hardware true/false constants, a SELECT_CC on the (possibly
	// converted) operands is returned directly.  Otherwise the node is split
	// into a SELECT_CC that produces the hardware true/false constant and a
	// SELECT on that result.
	SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
	{
	  DebugLoc DL = Op.getDebugLoc();
	  EVT VT = Op.getValueType();

	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue True = Op.getOperand(2);
	  SDValue False = Op.getOperand(3);
	  SDValue CC = Op.getOperand(4);
	  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
	  // Scratch value; only referenced by the disabled (#if 0) code below.
	  SDValue Temp;

	  // LHS and RHS are guaranteed to be the same value type
	  EVT CompareVT = LHS.getValueType();

	  // We need all the operands of SELECT_CC to have the same value type, so if
	  // necessary we need to convert LHS and RHS to be the same type True and
	  // False. True and False are guaranteed to have the same type as this
	  // SELECT_CC node.

	  if (CompareVT != VT) {
	    // Stays DELETED_NODE only for type pairings that are not expected (the
	    // assert below fires in that case).
	    ISD::NodeType ConversionOp = ISD::DELETED_NODE;
	    if (VT == MVT::f32 && CompareVT == MVT::i32) {
	      // Integer compare, float result: pick the conversion that matches the
	      // signedness of the comparison.
	      if (isUnsignedIntSetCC(CCOpcode)) {
	        ConversionOp = ISD::UINT_TO_FP;
	      } else {
	        ConversionOp = ISD::SINT_TO_FP;
	      }
	    } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
	      // Float compare, integer result.
	      ConversionOp = ISD::FP_TO_SINT;
	    } else {
	      // I don't think there will be any other type pairings.
	      assert(!"Unhandled operand type parings in SELECT_CC");
	    }
	    // XXX Check the value of LHS and RHS and avoid creating sequences like
	    // (FTOI (ITOF))
	    LHS = DAG.getNode(ConversionOp, DL, VT, LHS);
	    RHS = DAG.getNode(ConversionOp, DL, VT, RHS);
	  }

	  // If True is a hardware TRUE value and False is a hardware FALSE value we
	  // can handle this with a native instruction (SET* instructions).
	  if ((isHWTrueValue(True) && isHWFalseValue(False))) {
	    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
	  }

	  // XXX If True is a hardware FALSE value and False is a hardware TRUE value,
	  // we can handle this with a native instruction, but we need to swap true
	  // and false and change the conditional.  Not implemented yet: the branch
	  // below is an empty placeholder.
	  if (isHWTrueValue(False) && isHWFalseValue(True)) {
	  }

	  // XXX Check if we can lower this to a SELECT or if it is supported by a native
	  // operation. (The code below does this but we don't have the Instruction
	  // selection patterns to do this yet.)
	#if 0
	  if (isZero(LHS) || isZero(RHS)) {
	    SDValue Cond = (isZero(LHS) ? RHS : LHS);
	    bool SwapTF = false;
	    switch (CCOpcode) {
	    case ISD::SETOEQ:
	    case ISD::SETUEQ:
	    case ISD::SETEQ:
	      SwapTF = true;
	      // Fall through
	    case ISD::SETONE:
	    case ISD::SETUNE:
	    case ISD::SETNE:
	      // We can lower to select
	      if (SwapTF) {
	        Temp = True;
	        True = False;
	        False = Temp;
	      }
	      // CNDE
	      return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
	    default:
	      // Supported by a native operation (CNDGE, CNDGT)
	      return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
	    }
	  }
	#endif

	  // If we make it this far it means we have no native instructions to handle
	  // this SELECT_CC, so we must lower it.
	  SDValue HWTrue, HWFalse;

	  // Hardware true/false constants: 1.0f/0.0f for f32, -1/0 for i32.
	  if (VT == MVT::f32) {
	    HWTrue = DAG.getConstantFP(1.0f, VT);
	    HWFalse = DAG.getConstantFP(0.0f, VT);
	  } else if (VT == MVT::i32) {
	    HWTrue = DAG.getConstant(-1, VT);
	    HWFalse = DAG.getConstant(0, VT);
	  }
	  else {
	    assert(!"Unhandled value type in LowerSELECT_CC");
	  }

	  // Lower this unsupported SELECT_CC into a supported SELECT_CC (producing
	  // the hardware true/false constant) followed by a SELECT.
	  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, HWTrue, HWFalse, CC);

	  // Convert the floating point condition (1.0f / 0.0f) to an i32 by negating
	  // it and converting the result to a signed integer.
	  if (VT == MVT::f32) {
	    Cond = DAG.getNode(ISD::FP_TO_SINT, DL, MVT::i32,
	                       DAG.getNode(ISD::FNEG, DL, VT, Cond));
	  }

	  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
	}

	// Lowers an i32 SETCC to (1 & SELECT_CC(lhs, rhs, -1, 0, cc)), i.e. the
	// all-ones/zero select result is masked down to 1/0.
	SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const
	{
	  // SETCC operands: lhs, rhs, condition code.
	  SDValue Lhs = Op.getOperand(0);
	  SDValue Rhs = Op.getOperand(1);
	  SDValue CondCode = Op.getOperand(2);
	  const DebugLoc Loc = Op.getDebugLoc();
	  assert(Op.getValueType() == MVT::i32);

	  SDValue Mask = DAG.getNode(ISD::SELECT_CC, Loc, MVT::i32,
	                             Lhs, Rhs,
	                             DAG.getConstant(-1, MVT::i32),
	                             DAG.getConstant(0, MVT::i32),
	                             CondCode);

	  return DAG.getNode(ISD::AND, Loc, MVT::i32,
	                     DAG.getConstant(1, MVT::i32), Mask);
	}