/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file lower_x86.cpp
 *
 * @brief llvm pass to lower meta code to x86
 *
 * Notes:
 *
 ******************************************************************************/

#include "jit_pch.hpp"
#include "passes.h"
#include "JitManager.h"

#include "common/simdlib.hpp"

#include <unordered_map>

extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);

namespace llvm
{
    // forward declare the initializer
    void initializeLowerX86Pass(PassRegistry&);
} // namespace llvm

namespace SwrJit
{
    using namespace llvm;

    enum TargetArch
    {
        AVX    = 0,
        AVX2   = 1,
        AVX512 = 2
    };

    enum TargetWidth
    {
        W256       = 0,
        W512       = 1,
        NUM_WIDTHS = 2
    };

    struct LowerX86;

    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;

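    // Describes how one meta intrinsic is lowered: a native x86 intrinsic ID per target
    // width (Intrinsic::not_intrinsic when no direct mapping exists), plus an emulation
    // callback used when no native intrinsic is available for the requested width.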
    struct X86Intrinsic
    {
        IntrinsicID intrin[NUM_WIDTHS];
        EmuFunc     emuFunc;
    };

    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
    // previous behavior of mapping directly to avx/avx2 intrinsics.
    static std::map<std::string, IntrinsicID> intrinsicMap = {
        {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
        {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
        {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
        {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
        {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
        {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
        {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
        {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
    };

    // Forward decls
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);

    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin);

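    // Sentinel intrinsic ID used in the tables below: lower this width by double pumping
    // the next-smaller native intrinsic (see DOUBLE_EMU).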
    static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;

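    // Per-target lowering table, indexed by TargetArch. Each row maps a meta intrinsic name
    // to its {256-wide, 512-wide} native intrinsic IDs and an emulation fallback. A slot of
    // Intrinsic::not_intrinsic means the emulation callback is always used for that width.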
    // clang-format off
    static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
        //                            256 wide                                     512 wide
        {
            // AVX
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
        },
        {
            // AVX2
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
        },
        {
            // AVX512
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
#else
            {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
#endif
            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
#else
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
#endif
            {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
        }};
    // clang-format on

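    //////////////////////////////////////////////////////////////////////////
    /// @brief Function pass that replaces calls to meta.intrinsic.* placeholders with
    ///        native x86 intrinsics, or with emulation sequences, for the target ISA
    ///        (AVX / AVX2 / AVX512) detected from the JitManager.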
    struct LowerX86 : public FunctionPass
    {
        LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
        {
            initializeLowerX86Pass(*PassRegistry::getPassRegistry());

            // Determine target arch
            if (JM()->mArch.AVX512F())
            {
                mTarget = AVX512;
            }
            else if (JM()->mArch.AVX2())
            {
                mTarget = AVX2;
            }
            else if (JM()->mArch.AVX())
            {
                mTarget = AVX;
            }
            else
            {
                SWR_ASSERT(false, "Unsupported AVX architecture.");
                mTarget = AVX;
            }

            // Setup scatter function for 256 wide
            uint32_t curWidth = B->mVWidth;
            B->SetTargetWidth(8);
            std::vector<Type*> args = {
                B->mInt8PtrTy,   // pBase
                B->mSimdInt32Ty, // vIndices
                B->mSimdFP32Ty,  // vSrc
                B->mInt8Ty,      // mask
                B->mInt32Ty      // scale
            };

            FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
            mPfnScatter256 = cast<Function>(
#if LLVM_VERSION_MAJOR >= 9
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
#else
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
#endif
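            // Register the C helper with the JIT's symbol resolver so calls to
            // ScatterPS_256 emitted below resolve at run time.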
            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
            }

            B->SetTargetWidth(curWidth);
        }

        // Try to decipher the vector type of the instruction. This does not work properly
        // across all intrinsics, and will have to be rethought. Probably need something
        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
        // intrinsic.
        void GetRequestedWidthAndType(CallInst*       pCallInst,
                                      const StringRef intrinName,
                                      TargetWidth*    pWidth,
                                      Type**          pTy)
        {
            assert(pCallInst);
            Type* pVecTy = pCallInst->getType();

            // Check for intrinsic specific types
            // VCVTPD2PS type comes from src, not dst
            if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
            {
                Value* pOp = pCallInst->getOperand(0);
                assert(pOp);
                pVecTy = pOp->getType();
            }

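            // The call's return type may not be a vector (for example, when the call
            // returns void); in that case, derive the width from the first vector argument.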
            if (!pVecTy->isVectorTy())
            {
                for (auto& op : pCallInst->arg_operands())
                {
                    if (op.get()->getType()->isVectorTy())
                    {
                        pVecTy = op.get()->getType();
                        break;
                    }
                }
            }
            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");

            uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
            switch (width)
            {
            case 256:
                *pWidth = W256;
                break;
            case 512:
                *pWidth = W512;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width %d", width);
                *pWidth = W256;
            }

            *pTy = pVecTy->getScalarType();
        }

        Value* GetZeroVec(TargetWidth width, Type* pTy)
        {
            uint32_t numElem = 0;
            switch (width)
            {
            case W256:
                numElem = 8;
                break;
            case W512:
                numElem = 16;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }

            return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
        }

        Value* GetMask(TargetWidth width)
        {
            Value* mask;
            switch (width)
            {
            case W256:
                mask = B->C((uint8_t)-1);
                break;
            case W512:
                mask = B->C((uint16_t)-1);
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }
            return mask;
        }

        // Convert <N x i1> mask to <N x i32> x86 mask
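        // The AVX2 gather intrinsics interpret the sign bit of each 32-bit lane as the
        // per-lane mask, so sign extending i1 -> i32 yields 0xFFFFFFFF for active lanes
        // and 0 for inactive ones.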
        Value* VectorMask(Value* vi1Mask)
        {
            uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
            return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
        }

        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();
            assert(pFunc);

            auto&       intrinsic = intrinsicMap2[mTarget][pFunc->getName().str()];
            TargetWidth vecWidth;
            Type*       pElemTy;
            GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);

            // Check if there is a native intrinsic for this instruction
            IntrinsicID id = intrinsic.intrin[vecWidth];
            if (id == DOUBLE)
            {
                // Double pump the next smaller SIMD intrinsic
                SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
                Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
                SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
                           "Cannot find intrinsic to double pump.");
                return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
            }
            else if (id != Intrinsic::not_intrinsic)
            {
                Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
                SmallVector<Value*, 8> args;
                for (auto& arg : pCallInst->arg_operands())
                {
                    args.push_back(arg.get());
                }

                // If AVX512, all instructions take an additional src operand and mask. We pass
                // a zeroed src and a full mask for now, assuming the intrinsics are consistent
                // and place the src operand and mask last in the argument list.
                if (mTarget == AVX512)
                {
                    if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
                    {
                        args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
                        args.push_back(GetMask(W256));
                        // for AVX512 VCVTPD2PS, we also have to add rounding mode
                        args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
                    }
                    else
                    {
                        args.push_back(GetZeroVec(vecWidth, pElemTy));
                        args.push_back(GetMask(vecWidth));
                    }
                }

                return B->CALLA(pIntrin, args);
            }
            else
            {
                // No native intrinsic, call emulation function
                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
            }

            SWR_ASSERT(false);
            return nullptr;
        }

        Instruction* ProcessIntrinsic(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();
            assert(pFunc);

            // Forward to the advanced support if found
            if (intrinsicMap2[mTarget].find(pFunc->getName().str()) != intrinsicMap2[mTarget].end())
            {
                return ProcessIntrinsicAdvanced(pCallInst);
            }

            SWR_ASSERT(intrinsicMap.find(pFunc->getName().str()) != intrinsicMap.end(),
                       "Unimplemented intrinsic %s.",
                       pFunc->getName().str().c_str());

            Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName().str()];
            Function*     pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                args.push_back(arg.get());
            }
            return B->CALLA(pX86IntrinFunc, args);
        }

        //////////////////////////////////////////////////////////////////////////
        /// @brief LLVM function pass run method.
        /// @param F - The function we're working on with this pass.
        virtual bool runOnFunction(Function& F)
        {
            std::vector<Instruction*> toRemove;
            std::vector<BasicBlock*>  bbs;

            // Make temp copy of the basic blocks and instructions, as the intrinsic
            // replacement code might invalidate the iterators
            for (auto& b : F.getBasicBlockList())
            {
                bbs.push_back(&b);
            }

            for (auto* BB : bbs)
            {
                std::vector<Instruction*> insts;
                for (auto& i : BB->getInstList())
                {
                    insts.push_back(&i);
                }

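                // Lower every call to a meta.intrinsic.* placeholder, replace its uses with
                // the lowered value, and queue the original call for removal.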
                for (auto* I : insts)
                {
                    if (CallInst* pCallInst = dyn_cast<CallInst>(I))
                    {
                        Function* pFunc = pCallInst->getCalledFunction();
                        if (pFunc)
                        {
                            if (pFunc->getName().startswith("meta.intrinsic"))
                            {
                                B->IRB()->SetInsertPoint(I);
                                Instruction* pReplace = ProcessIntrinsic(pCallInst);
                                toRemove.push_back(pCallInst);
                                if (pReplace)
                                {
                                    pCallInst->replaceAllUsesWith(pReplace);
                                }
                            }
                        }
                    }
                }
            }

            for (auto* pInst : toRemove)
            {
                pInst->eraseFromParent();
            }

            JitManager::DumpToFile(&F, "lowerx86");

            return true;
        }

        virtual void getAnalysisUsage(AnalysisUsage& AU) const {}

        JitManager* JM() { return B->JM(); }
        Builder*    B;
        TargetArch  mTarget;
        Function*   mPfnScatter256;

        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
    };

    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.

    FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }

    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
        return nullptr;
    }

    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        // vperm emulation is needed for AVX, and for AVX512 when the table above routes
        // VPERMPS/VPERMD here (LLVM 7 and newer).
        SWR_ASSERT(arch == AVX || arch == AVX512);

        Builder* B = pThis->B;
        auto v32A      = pCallInst->getArgOperand(0);
        auto vi32Index = pCallInst->getArgOperand(1);

        Value* v32Result;
        if (isa<Constant>(vi32Index))
        {
            // Can use llvm shuffle vector directly with constant shuffle indices
            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
        }
        else
        {
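            // Variable indices: emulate the permute with a scalar extract/insert loop,
            // selecting one source lane per result lane.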
            v32Result = UndefValue::get(v32A->getType());
            for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
            {
                auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
                auto val      = B->VEXTRACT(v32A, i32Index);
                v32Result     = B->VINSERT(v32Result, val, B->C(l));
            }
        }
        return cast<Instruction>(v32Result);
    }

    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B = pThis->B;
        auto vSrc        = pCallInst->getArgOperand(0);
        auto pBase       = pCallInst->getArgOperand(1);
        auto vi32Indices = pCallInst->getArgOperand(2);
        auto vi1Mask     = pCallInst->getArgOperand(3);
        auto i8Scale     = pCallInst->getArgOperand(4);

        pBase            = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
        uint32_t numElem = vSrc->getType()->getVectorNumElements();
        auto i32Scale    = B->Z_EXT(i8Scale, B->mInt32Ty);
        auto srcTy       = vSrc->getType()->getVectorElementType();
        Value* v32Gather = nullptr;
        if (arch == AVX)
        {
            // Full emulation for AVX
            // Store source on stack to provide a valid address to load from inactive lanes
            auto pStack = B->STACKSAVE();
            auto pTmp   = B->ALLOCA(vSrc->getType());
            B->STORE(vSrc, pTmp);

            v32Gather = UndefValue::get(vSrc->getType());
#if LLVM_VERSION_MAJOR > 10
            auto vi32Scale = ConstantVector::getSplat(ElementCount(numElem, false), cast<ConstantInt>(i32Scale));
#else
            auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
#endif
            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

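            // Per lane: if the mask bit is set, load from pBase + offset; otherwise re-load
            // the lane's original value from the stack copy, so inactive lanes keep vSrc and
            // no potentially invalid address is ever dereferenced.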
            for (uint32_t i = 0; i < numElem; ++i)
            {
                auto i32Offset          = B->VEXTRACT(vi32Offsets, B->C(i));
                auto pLoadAddress       = B->GEP(pBase, i32Offset);
                pLoadAddress            = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
                auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
                auto i1Mask             = B->VEXTRACT(vi1Mask, B->C(i));
                auto pValidAddress      = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
                auto val                = B->LOAD(pValidAddress);
                v32Gather               = B->VINSERT(v32Gather, val, B->C(i));
            }

            B->STACKRESTORE(pStack);
        }
        else if (arch == AVX2 || (arch == AVX512 && width == W256))
        {
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_ps_256);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_d_256);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_q_256);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            if (width == W256)
            {
                auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
                v32Gather    = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
            }
            else if (width == W512)
            {
                // Double pump 4-wide for 64bit elements
                if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
                {
                    auto v64Mask = pThis->VectorMask(vi1Mask);
                    v64Mask      = B->S_EXT(
                        v64Mask,
                        VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
                    v64Mask = B->BITCAST(v64Mask, vSrc->getType());

                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));

                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));

                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));

                    src0 = B->BITCAST(
                        src0,
                        VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
                    mask0 = B->BITCAST(
                        mask0,
                        VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    src1 = B->BITCAST(
                        src1,
                        VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
                    mask1 = B->BITCAST(
                        mask1,
                        VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                    v32Gather = B->BITCAST(v32Gather, vSrc->getType());
                }
                else
                {
                    // Double pump 8-wide for 32bit elements
                    auto v32Mask = pThis->VectorMask(vi1Mask);
                    v32Mask      = B->BITCAST(v32Mask, vSrc->getType());
                    Value* src0  = B->EXTRACT_16(vSrc, 0);
                    Value* src1  = B->EXTRACT_16(vSrc, 1);

                    Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                    Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

                    Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                    Value* mask1 = B->EXTRACT_16(v32Mask, 1);

                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->JOIN_16(gather0, gather1);
                }
            }
        }
        else if (arch == AVX512)
        {
            Value*    iMask          = nullptr;
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dps_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpi_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpd_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt8Ty);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
            v32Gather     = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
        }

        return cast<Instruction>(v32Gather);
    }

    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B = pThis->B;
        auto pBase       = pCallInst->getArgOperand(0);
        auto vi1Mask     = pCallInst->getArgOperand(1);
        auto vi32Indices = pCallInst->getArgOperand(2);
        auto v32Src      = pCallInst->getArgOperand(3);
        auto i32Scale    = pCallInst->getArgOperand(4);

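        // Note the operand orders involved: the meta intrinsic supplies
        // (pBase, vi1Mask, vi32Indices, v32Src, i32Scale), the ScatterPS_256 C helper takes
        // (base, indices, src, mask, scale), and the AVX512 scatter intrinsics take
        // (base, mask, indices, src, scale).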
        if (arch != AVX512)
        {
            // Call into C function to do the scatter. This has significantly better compile perf
            // compared to jitting scatter loops for every scatter
            if (width == W256)
            {
                auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
            }
            else
            {
                // Need to break up 512 wide scatter to two 256 wide
                auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto indicesLo =
                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));

                auto mask = B->BITCAST(maskLo, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});

                auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto indicesHi =
                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));

                mask = B->BITCAST(maskHi, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
            }
            return nullptr;
        }

        Value*    iMask;
        Function* pX86IntrinFunc;
        if (width == W256)
        {
            // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we
            // can use the scatter of 8 elements with 64bit indices
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_qps_512);

            auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
            iMask               = B->BITCAST(vi1Mask, B->mInt8Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
        }
        else if (width == W512)
        {
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_dps_512);
            iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
        }
        return nullptr;
    }

    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
    // instructions
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B       = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);
        assert(vf32Src);
        auto i8Round = pCallInst->getOperand(1);
        assert(i8Round);
        auto pfnFunc =
            Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);

        if (width == W256)
        {
            return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
        }
        else if (width == W512)
        {
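            // Double pump: round each 256-bit half with the AVX intrinsic, then rejoin the
            // two halves into a single 512-bit result.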
            auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
            auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);

            auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
            auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);

            return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B       = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);

        if (width == W256)
        {
            // A 256-bit (4 x double) source lowers to a plain fptrunc to 4 x float; no
            // intrinsic is required here.
            return cast<Instruction>(B->FP_TRUNC(
                vf32Src,
                VectorType::get(B->mFP32Ty, vf32Src->getType()->getVectorNumElements())));
        }
        else if (width == W512)
        {
            // 512-bit source can use the masked AVX512 intrinsic. It expects a pass-through
            // src, a mask, and a rounding mode in addition to the source, so supply a zero
            // vector, a full mask, and the default rounding mode, mirroring the operands
            // ProcessIntrinsicAdvanced sets up for the pre-LLVM-7 path.
            auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                     Intrinsic::x86_avx512_mask_cvtpd2ps_512);
            return cast<Instruction>(
                B->CALL(pfnFunc,
                        {vf32Src,
                         pThis->GetZeroVec(W256, B->mFP32Ty),
                         pThis->GetMask(W256),
                         B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    // No support for hsub in AVX512
    Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B    = pThis->B;
        auto src0 = pCallInst->getOperand(0);
        auto src1 = pCallInst->getOperand(1);

        // 256b hsub can just use avx intrinsic
        if (width == W256)
        {
            auto pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
            return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
        }
        else if (width == W512)
        {
            // 512b hsub can be accomplished with shuf/sub combo
            auto minuend    = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
            return cast<Instruction>(B->SUB(minuend, subtrahend));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
            return nullptr;
        }
    }

    // Double pump the input through the supplied 256-wide intrinsic: blindly extract the
    // lower and upper 256 bits of each vector argument, call the 256-wide intrinsic on each
    // half, then merge the two results back into a single 512-wide value.
    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin)
    {
        auto B = pThis->B;
        SWR_ASSERT(width == W512);
        Value*    result[2];
        Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
        for (uint32_t i = 0; i < 2; ++i)
        {
            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                auto argType = arg.get()->getType();
                if (argType->isVectorTy())
                {
                    uint32_t vecWidth = argType->getVectorNumElements();
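                    // Build a shuffle mask selecting lanes [i * vecWidth / 2, (i + 1) * vecWidth / 2),
                    // so iteration 0 passes the lower half of the argument and iteration 1 the
                    // upper half.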
                    Value* lanes     = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
                    Value* argToPush = B->VSHUFFLE(
                        arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
                    args.push_back(argToPush);
                }
                else
                {
                    args.push_back(arg.get());
                }
            }
            result[i] = B->CALLA(pX86IntrinFunc, args);
        }
        uint32_t vecWidth;
        if (result[0]->getType()->isVectorTy())
        {
            assert(result[1]->getType()->isVectorTy());
            vecWidth = result[0]->getType()->getVectorNumElements() +
                       result[1]->getType()->getVectorNumElements();
        }
        else
        {
            vecWidth = 2;
        }
        Value* lanes = B->CInc<int>(0, vecWidth);
        return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
    }

} // namespace SwrJit

using namespace SwrJit;

INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)