| #include "caffe2/opt/optimize_ideep.h" |
| #include "caffe2/opt/converter.h" |
| |
| #ifdef CAFFE2_USE_MKLDNN |
| #include <cpuinfo.h> |
| #include "caffe2/ideep/ideep_utils.h" |
| #endif |
| |
| namespace caffe2 { |
| namespace opt { |
| |
| using namespace nom; |
| |
| #ifndef CAFFE2_USE_MKLDNN |
| void OptimizeForMkldnn( |
| repr::NNModule* nn, |
| caffe2::Workspace* ws, |
| bool training_mode) { |
| LOG(WARNING) << "Only support optimizations for IDEEP"; |
| } |
| |
| #else |
| USE_IDEEP_DEF_ALIASES(); |
| |
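| // Helpers for pulling weights out of the workspace. The first overload looks |
| // up a blob by name (and asserts it exists); the second resolves the blob |
| // backing a tensor node in the NNGraph. |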
| Blob* getBlob(const std::string name, caffe2::Workspace* ws) { |
| CAFFE_ENFORCE(ws->HasBlob(name), "Blob ", name, " not in workspace"); |
| return ws->GetBlob(name); |
| } |
| |
| Blob* getBlob(repr::NNGraph::NodeRef node, caffe2::Workspace* ws) { |
| auto tensor = repr::nn::get<repr::Tensor>(node); |
| return getBlob(tensor->getName(), ws); |
| } |
| |
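| // getTensor returns a copy of the blob's contents; getMutableTensor returns |
| // a mutable pointer only if the blob already holds a tensor of type T, and |
| // nullptr otherwise. |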
| template <class T> |
| T getTensor(Blob* blob) { |
| CAFFE_ENFORCE(blob, "Blob is invalid"); |
| return blob->template Get<T>(); |
| } |
| |
| template <class T> |
| T* getMutableTensor(Blob* blob) { |
| CAFFE_ENFORCE(blob, "Blob is invalid"); |
| if (blob && blob->template IsType<T>()) { |
| return blob->template GetMutable<T>(); |
| } |
| return nullptr; |
| } |
| |
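| // Each operator node carries its Caffe2 OperatorDef in a Caffe2Annotation; |
| // these helpers extract it in read-only and mutable form, throwing if the |
| // annotation is missing. |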
| const caffe2::OperatorDef& getOpDef(const repr::NeuralNetOperator& nnOp) { |
| auto annotation = nnOp.getAnnotation(); |
| if (annotation == nullptr) { |
| CAFFE_THROW("Cannot get Operator annotation"); |
| } |
| return dyn_cast<Caffe2Annotation>(annotation)->getOperatorDef(); |
| } |
| |
| caffe2::OperatorDef* getMutableOpDef(repr::NeuralNetOperator& nnOp) { |
| auto annotation = nnOp.getMutableAnnotation(); |
| if (annotation == nullptr) { |
| CAFFE_THROW("Cannot get Operator annotation"); |
| } |
| return dyn_cast<Caffe2Annotation>(annotation)->getMutableOperatorDef(); |
| } |
| |
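| // Returns true if nodeRef is an operator node whose underlying OperatorDef |
| // type equals typeName. |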
| bool isOpType(const repr::NNGraph::NodeRef& nodeRef, string typeName) { |
| if (!repr::nn::is<repr::NeuralNetOperator>(nodeRef)) { |
| return false; |
| } |
| auto op = repr::nn::get<repr::NeuralNetOperator>(nodeRef); |
| // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) |
| auto opDef = getOpDef(*op); |
| return opDef.type() == typeName; |
| } |
| |
| bool isOnIdeepDevice(const repr::NeuralNetOperator& nnOp) { |
| // We only want to fuse for IDEEP operators |
| const auto& op = getOpDef(nnOp); |
| return op.device_option().device_type() == DeviceTypeProto::PROTO_IDEEP; |
| } |
| |
| bool isConvFusion(repr::NNGraph::NodeRef convNode, int fusion_type) { |
| // Check whether convNode is a ConvFusion op with the given fusion type |
| // (FP32 path only); FUSION_MAX matches any ConvFusion. |
| if (!repr::nn::is<repr::Conv>(convNode)) { |
| return false; |
| } |
| |
| auto conv = repr::nn::get<repr::Conv>(convNode); |
| auto& op = getOpDef(*conv); |
| |
| if (op.type() == "ConvFusion") { |
| for (const auto& arg : op.arg()) { |
| if (arg.name() == "fusion_type") { |
| if (fusion_type == FUSION_MAX) { |
| return true; |
| } |
| return arg.i() == fusion_type; |
| } |
| } |
| } |
| |
| return false; |
| } |
| |
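| // Rewrites a Conv op into ConvFusion with the requested fusion_type, e.g. |
| // op { type: "ConvFusion" arg { name: "fusion_type" i: <fusion_type> } ... }. |
| // If the op is already a ConvFusion, the only legal upgrade is adding a ReLU |
| // to an existing Conv+Sum (FUSION_CONV_SUM -> FUSION_CONV_SUM_RELU). |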
| void resetConvForFusion(repr::NNGraph::NodeRef convNode, int fusion_type) { |
| auto conv = repr::nn::get<repr::Conv>(convNode); |
| auto* op = getMutableOpDef(*conv); |
| |
| if (op == nullptr) { |
| return; |
| } |
| |
| if (op->type() == "ConvFusion") { |
| CAFFE_ENFORCE(fusion_type == FUSION_CONV_RELU, "Invalid nested fusion"); |
| for (auto& arg : *op->mutable_arg()) { |
| if (arg.name() == "fusion_type") { |
| CAFFE_ENFORCE(arg.i() == FUSION_CONV_SUM, "Invalid nested fusion"); |
| // Only from FUSION_CONV_SUM to FUSION_CONV_SUM_RELU |
| arg.set_i(FUSION_CONV_SUM_RELU); |
| return; |
| } |
| } |
| CAFFE_THROW("Can not find fusion type in ConvFusion"); |
| } |
| |
| CAFFE_ENFORCE_LT(fusion_type, FUSION_CONV_SUM_RELU, "Invalid fusion type"); |
| op->set_type("ConvFusion"); |
| auto* arg = op->add_arg(); |
| arg->set_name("fusion_type"); |
| arg->set_i(fusion_type); |
| } |
| |
| void removeArg(repr::NeuralNetOperator& nnOp, std::string argName) { |
| auto* op = getMutableOpDef(nnOp); |
| auto& opArgs = *op->mutable_arg(); |
| auto remove_arg = [](decltype(opArgs)& args, std::string& name) { |
| for (auto it = args.begin(); it != args.end(); it++) { |
| if (it->name() == name) { |
| args.erase(it); |
| return true; |
| } |
| } |
| return false; |
| }; |
| while (remove_arg(opArgs, argName)) |
| ; |
| } |
| |
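| // Copies the argument named argName from srcOp to dstOp, first removing any |
| // existing argument with that name on dstOp. No-op if srcOp does not carry |
| // the argument or if srcOp == dstOp. |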
| void moveOpArg( |
| caffe2::Workspace* ws, |
| std::string argName, |
| repr::NeuralNetOperator* srcOp, |
| repr::NeuralNetOperator* dstOp) { |
| if (argName.empty() || srcOp == nullptr || dstOp == nullptr || srcOp == dstOp) |
| return; |
| removeArg(*dstOp, argName); |
| |
| auto& src = getOpDef(*srcOp); |
| auto& src_args = src.arg(); |
| auto src_it = src_args.begin(); |
| for (; src_it != src_args.end(); src_it++) { |
| if (src_it->name() == argName) |
| break; |
| } |
| if (src_it == src_args.end()) |
| return; |
| |
| auto* dst = getMutableOpDef(*dstOp); |
| auto* arg = dst->add_arg(); |
| *arg = *src_it; |
| arg->set_name(argName); |
| } |
| |
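| // Drops StopGradient ops whose input and output refer to the same tensor |
| // name; such nodes are pure identities at inference time. Returns true if |
| // the graph was modified (one node per call). |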
| bool removeStopGradientForInference(repr::NNModule* nn, caffe2::Workspace* ws) { |
| auto allNodes = nn->dataFlow.getMutableNodes(); |
| // NOLINTNEXTLINE(modernize-loop-convert,clang-diagnostic-sign-compare) |
| for (int i = 0; i < allNodes.size(); ++i) { |
| auto node = allNodes[i]; |
| if (!isOpType(node, "StopGradient")) { |
| continue; |
| } |
| |
| auto stopGradInput = repr::nn::getInputs(node).front(); |
| auto stopGradOutput = repr::nn::getOutputs(node).front(); |
| auto inputName = repr::nn::get<repr::Tensor>(stopGradInput)->getName(); |
| auto outputName = repr::nn::get<repr::Tensor>(stopGradOutput)->getName(); |
| if (inputName == outputName) { |
| nn->dataFlow.replaceNode(stopGradOutput, stopGradInput); |
| nn->dataFlow.deleteNode(node); |
| return true; |
| } |
| } |
| return false; |
| } |
| |
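| // Folds a following BatchNormalization (or AffineChannel) into the Conv |
| // weights. Per output channel c, the loop below computes |
| //   coeff = scale[c] / sqrt(variance[c] + epsilon)   (coeff = scale[c] for |
| //                                                      AffineChannel) |
| //   filter[c] *= coeff |
| //   bias[c] = bn_bias[c] + (conv_bias[c] - mean[c]) * coeff |
| // then re-wires the graph so the Conv directly produces the BN output. |
| // Returns true if one pair was fused. |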
| bool fuseConvBNAndAffCh(repr::NNModule* nn, caffe2::Workspace* ws) { |
| for (auto node_pair : repr::nn::dataIterator<repr::Conv>(nn->dataFlow)) { |
| bool no_bias = false; |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| repr::NNGraph::NodeRef convNode; |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| repr::Conv* conv; |
| std::tie(conv, convNode) = node_pair; |
| |
| if (!isOnIdeepDevice(*conv)) { |
| LOG(WARNING) << "Not a IDEEP operator"; |
| continue; |
| } |
| |
| const auto& convOp = getOpDef(*conv); |
| if (convOp.type() == "ConvFusion") { |
| continue; |
| } |
| |
| auto convOutput = repr::nn::getOutputs(convNode).front(); |
| auto consumers = repr::nn::getConsumers(convOutput); |
| // convOutput must only be consumed by the following BN/AffineChannel op. |
| if (consumers.size() != 1) { |
| continue; |
| } |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| bool isBN; |
| auto consumer = consumers.front(); |
| if (repr::nn::is<repr::BatchNormalization>(consumer)) { |
| isBN = true; |
| } else if (isOpType(consumer, "AffineChannel")) { |
| isBN = false; |
| } else { |
| continue; |
| } |
| |
| auto bnOrAffChNode = consumer; |
| auto bn = |
| isBN ? repr::nn::get<repr::BatchNormalization>(bnOrAffChNode) : nullptr; |
| auto bnOrAffChOutput = repr::nn::getOutputs(bnOrAffChNode).front(); |
| |
| auto convInputs = repr::nn::getInputs(convNode); |
| if (convInputs.size() < 2) { |
| LOG(WARNING) << "Invalid convolution input size"; |
| continue; |
| } |
| |
| auto bnOrAffChInputs = repr::nn::getInputs(bnOrAffChNode); |
| int numInputs = isBN ? 5 : 3; |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| if (bnOrAffChInputs.size() < numInputs) { |
| LOG(WARNING) << "Invalid input size: " << bnOrAffChInputs.size() |
| << ", expect " << numInputs; |
| continue; |
| } |
| |
| // If Conv has no bias input, borrow the BN/AffineChannel bias tensor |
| if (convInputs.size() < 3) { |
| no_bias = true; |
| nn->dataFlow.createEdge(bnOrAffChInputs[2], convNode); |
| convInputs = repr::nn::getInputs(convNode); |
| } |
| |
| #define EXPOSE_TENSOR_DATA(name, index, nodes, need_init) \ |
| itensor* name = nullptr; \ |
| itensor name##Tensor; \ |
| float* name##Data = nullptr; \ |
| if (need_init) { \ |
| name = getMutableTensor<itensor>(getBlob(nodes[index], ws)); \ |
| if (name == nullptr) { \ |
| LOG(WARNING) << #name " is not an IDEEP tensor"; \ |
| continue; \ |
| } \ |
| name##Tensor.resize(name->get_dims(), name->get_data_type()); \ |
| name##Tensor.feed_from(*name); \ |
| CAFFE_ENFORCE( \ |
| name##Tensor.is_public_format(), #name " is not in public format"); \ |
| name##Data = static_cast<float*>(name##Tensor.get_data_handle()); \ |
| } |
| |
| EXPOSE_TENSOR_DATA(filter, 1, convInputs, true); |
| EXPOSE_TENSOR_DATA(biasConv, 2, convInputs, true); |
| |
| EXPOSE_TENSOR_DATA(scale, 1, bnOrAffChInputs, true); |
| EXPOSE_TENSOR_DATA(biasBNOrAffCh, 2, bnOrAffChInputs, true); |
| EXPOSE_TENSOR_DATA(mean, 3, bnOrAffChInputs, isBN); |
| EXPOSE_TENSOR_DATA(variance, 4, bnOrAffChInputs, isBN); |
| |
| #undef EXPOSE_TENSOR_DATA |
| |
| // Assume filter layout M{CHW,HWC}, i.e. output channels (M) first |
| auto chwDim = filterTensor.get_dim(1) * filterTensor.get_dim(2) * |
| filterTensor.get_dim(3); |
| for (auto c = 0; c < filterTensor.get_dim(0); ++c) { |
| float mean_val = 0; |
| float variance_val = 1; |
| if (isBN) { |
| mean_val = meanData[c]; |
| variance_val = std::sqrt(varianceData[c] + bn->getEpsilon()); |
| } |
| float coeff = scaleData[c] / variance_val; |
| for (auto i = 0; i < chwDim; ++i) { |
| filterData[c * chwDim + i] *= coeff; |
| } |
| |
| if (no_bias) { |
| biasConvData[c] = biasBNOrAffChData[c] - mean_val * coeff; |
| } else { |
| biasConvData[c] = |
| biasBNOrAffChData[c] + (biasConvData[c] - mean_val) * coeff; |
| } |
| } |
| |
| filter->feed_from(filterTensor); |
| biasConv->feed_from(biasConvTensor); |
| nn->dataFlow.replaceNode(convOutput, bnOrAffChOutput); |
| |
| nn->dataFlow.deleteNode(bnOrAffChNode); |
| nn->dataFlow.deleteNode(convOutput); |
| |
| return true; |
| } |
| return false; |
| } |
| |
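| // Fuses Conv followed by an element-wise (Int8)Sum/Add into a single op: |
| // ConvFusion with FUSION_CONV_SUM for FP32, or Int8ConvSum/Int8ConvSumRelu |
| // for the DNNLOWP path. The other Sum input becomes an extra Conv input, and |
| // both it and the fused output are renamed to "<name>_fusion_fix_<i>" so the |
| // fused op writes its result in place into that buffer. |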
| bool fuseConvSum(repr::NNModule* nn, caffe2::Workspace* ws) { |
| CAFFE_ENFORCE(cpuinfo_initialize(), "failed to initialize cpuinfo"); |
| // Assume the order of nodes from getMutableNodes matches |
| // the original topological order of the operators |
| auto allNodes = nn->dataFlow.getMutableNodes(); |
| // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) |
| for (int i = allNodes.size() - 1; i > 0; i--) { |
| auto sumNode = allNodes[i]; |
| if (!repr::nn::hasInputs(sumNode)) { |
| continue; |
| } |
| |
| // [Caution] On the IDEEP device only the element-wise Add operator is |
| // supported so far; it behaves exactly like an element-wise Sum, with no |
| // scalar broadcast. |
| bool is_dnnlowp_sum = false; |
| if (isOpType(sumNode, "Int8Sum") || isOpType(sumNode, "Int8Add") || |
| isOpType(sumNode, "Int8SumRelu") || isOpType(sumNode, "Int8AddRelu")) { |
| is_dnnlowp_sum = true; |
| } else if (!repr::nn::is<repr::Sum>(sumNode) && !isOpType(sumNode, "Add")) { |
| continue; |
| } |
| |
| auto sum = repr::nn::get<repr::NeuralNetOperator>(sumNode); |
| if (!isOnIdeepDevice(*sum)) { |
| LOG(WARNING) << "Not a IDEEP operator"; |
| continue; |
| } |
| |
| auto sumInputs = repr::nn::getInputs(sumNode); |
| if (sumInputs.size() != 2) { |
| continue; |
| } |
| |
| int sum_idx = i; |
| repr::NNGraph::NodeRef convNode = nullptr; |
| while (--i >= 0) { |
| if (repr::nn::is<repr::NeuralNetOperator>(allNodes[i])) { |
| // Find the nearest conv Op before Sum |
| if (repr::nn::is<repr::Conv>(allNodes[i]) || |
| isOpType(allNodes[i], "Int8Conv")) { |
| convNode = allNodes[i]; |
| break; |
| } |
| } |
| } |
| if (convNode == nullptr || isConvFusion(convNode, FUSION_MAX)) { |
| continue; |
| } |
| int conv_idx = i; |
| |
| auto conv = repr::nn::get<repr::NeuralNetOperator>(convNode); |
| if (!isOnIdeepDevice(*conv)) { |
| LOG(WARNING) << "Not a IDEEP operator"; |
| continue; |
| } |
| |
| auto group = 1; |
| auto* convOp = getMutableOpDef(*conv); |
| for (const auto& arg : convOp->arg()) { |
| if (arg.name() == "group") { |
| group = arg.i(); |
| break; |
| } |
| } |
| if (group > 1 && !cpuinfo_has_x86_avx512f()) { |
| LOG(WARNING) << "Not support conv sum fusion with grouped filter"; |
| continue; |
| } |
| |
| auto convOutput = repr::nn::getOutputs(convNode).front(); |
| if (convOutput != sumInputs[0] && convOutput != sumInputs[1]) { |
| continue; |
| } |
| repr::NNGraph::NodeRef sumInputX = |
| (sumInputs[0] == convOutput ? sumInputs[1] : sumInputs[0]); |
| CAFFE_ENFORCE(sumInputX != nullptr, "Invalid sum inputs"); |
| if (sumInputX->getInEdges().size() <= 0) { |
| continue; |
| } |
| |
| auto preNode = repr::nn::getProducer(sumInputX); |
| if (preNode == nullptr || !repr::nn::is<repr::NeuralNetOperator>(preNode)) { |
| LOG(WARNING) << "Can not fuse Conv Sum"; |
| continue; |
| } |
| int pre_idx = sum_idx - 1; |
| while (pre_idx >= 0) { |
| if (preNode == allNodes[pre_idx]) { |
| break; |
| } |
| pre_idx--; |
| } |
| |
| bool should_fuse = true; |
| auto convInput = repr::nn::getInputs(convNode).front(); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| for (int idx = conv_idx + 1; idx < allNodes.size() - 1; ++idx) { |
| if (idx == sum_idx || |
| !repr::nn::is<repr::NeuralNetOperator>(allNodes[idx])) { |
| continue; |
| } |
| |
| auto checkNode = allNodes[idx]; |
| auto checkInputs = repr::nn::getInputs(checkNode); |
| // The Conv output must not be used by any other op after the Conv node |
| // (except the fused Sum). The other Sum input (sumInputX) must not be used |
| // by any op after the Sum node, because the Sum output is written in place |
| // into sumInputX. |
| // NOLINTNEXTLINE(modernize-loop-convert) |
| for (size_t input_idx = 0; input_idx < checkInputs.size(); ++input_idx) { |
| if (convOutput == checkInputs[input_idx] || |
| (idx > sum_idx && sumInputX == checkInputs[input_idx])) { |
| should_fuse = false; |
| break; |
| } |
| } |
| if (!should_fuse) { |
| break; |
| } |
| |
| // If Conv is fused with Sum, the Conv op is effectively pulled down between |
| // preNode and Sum. Check that the Conv input tensor buffer is not rewritten |
| // by any op between Conv and preNode. |
| if (idx <= pre_idx) { |
| auto checkOutputs = repr::nn::getOutputs(checkNode); |
| // NOLINTNEXTLINE(modernize-loop-convert) |
| for (size_t output_idx = 0; output_idx < checkOutputs.size(); |
| ++output_idx) { |
| auto check_output_tensor = |
| repr::nn::get<repr::Tensor>(checkOutputs[output_idx]); |
| auto conv_input_tensor = repr::nn::get<repr::Tensor>(convInput); |
| if (conv_input_tensor->getName() == check_output_tensor->getName()) { |
| should_fuse = false; |
| break; |
| } |
| } |
| } |
| if (!should_fuse) { |
| break; |
| } |
| } |
| if (!should_fuse) { |
| continue; |
| } |
| |
| nn->dataFlow.createEdge(sumInputX, convNode); |
| auto newOutputName = repr::nn::get<repr::Tensor>(sumInputX)->getName() + |
| "_fusion_fix_" + std::to_string(i); |
| |
| auto newInputTensor = std::make_unique<repr::Tensor>(newOutputName); |
| auto newInput = nn->dataFlow.createNode( |
| unique_dyn_cast<repr::NeuralNetData>(newInputTensor)); |
| |
| nn->dataFlow.replaceNode(sumInputX, newInput); |
| nn->dataFlow.deleteNode(sumInputX); |
| |
| auto newOutputTensor = std::make_unique<repr::Tensor>(newOutputName); |
| auto newOutput = nn->dataFlow.createNode( |
| unique_dyn_cast<repr::NeuralNetData>(newOutputTensor)); |
| |
| auto sumOutput = repr::nn::getOutputs(sumNode).front(); |
| nn->dataFlow.replaceNode(sumOutput, newOutput); |
| nn->dataFlow.createEdge(convNode, newOutput); |
| |
| if (!is_dnnlowp_sum) { |
| resetConvForFusion(convNode, FUSION_CONV_SUM); |
| } else { |
| moveOpArg(ws, "Y_scale", sum, conv); |
| moveOpArg(ws, "Y_zero_point", sum, conv); |
| |
| if (isOpType(sumNode, "Int8Sum") || isOpType(sumNode, "Int8Add")) { |
| convOp->set_type("Int8ConvSum"); |
| } else if ( |
| isOpType(sumNode, "Int8SumRelu") || |
| isOpType(sumNode, "Int8AddRelu")) { |
| convOp->set_type("Int8ConvSumRelu"); |
| } else { |
| CAFFE_THROW("Unsupport operator in conv fusion"); |
| } |
| } |
| |
| nn->dataFlow.deleteNode(sumNode); |
| nn->dataFlow.deleteNode(sumOutput); |
| nn->dataFlow.deleteNode(convOutput); |
| return true; |
| } |
| return false; |
| } |
| |
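| // Fuses Conv (or an existing Conv+Sum fusion) followed by Relu by setting |
| // the fusion_type to FUSION_CONV_RELU, which resetConvForFusion upgrades to |
| // FUSION_CONV_SUM_RELU when the Conv is already a Conv+Sum fusion. |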
| bool fuseActivation(repr::NNModule* nn, caffe2::Workspace* ws) { |
| // Conv+Relu fusion |
| for (auto node_pair : repr::nn::dataIterator<repr::Conv>(nn->dataFlow)) { |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| repr::NNGraph::NodeRef conv_node; |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| repr::Conv* conv; |
| std::tie(conv, conv_node) = node_pair; |
| |
| // Check topological feasibility |
| auto conv_outputs = repr::nn::getOutputs(conv_node); |
| if (conv_outputs.size() != 1) { |
| continue; |
| } |
| auto conv_output = conv_outputs.front(); |
| |
| auto consumers = repr::nn::getConsumers(conv_output); |
| if (consumers.size() != 1) { |
| continue; |
| } |
| if (!repr::nn::is<repr::Relu>(consumers.front())) { |
| continue; |
| } |
| auto relu_node = consumers.front(); |
| |
| auto relu_outputs = repr::nn::getOutputs(relu_node); |
| if (relu_outputs.size() != 1) { |
| continue; |
| } |
| |
| // Check feasibility with application specific logic |
| if (!isOnIdeepDevice(*conv)) { |
| continue; |
| } |
| |
| // Ready to fuse |
| auto relu_output = relu_outputs.front(); |
| auto output_tensor = repr::nn::get<repr::Tensor>(relu_output); |
| auto output_node = relu_output; |
| auto input_tensor = |
| repr::nn::get<repr::Tensor>(repr::nn::getInputs(conv_node).front()); |
| |
| if (isConvFusion(conv_node, FUSION_CONV_SUM)) { |
| nn->dataFlow.replaceNode(relu_output, conv_output); |
| nn->dataFlow.deleteNode(relu_node); |
| nn->dataFlow.deleteNode(relu_output); |
| } else { |
| // Conv cannot be in-place |
| if (output_tensor->getName() != input_tensor->getName()) { |
| nn->dataFlow.replaceNode(conv_output, relu_output); |
| nn->dataFlow.deleteNode(relu_node); |
| nn->dataFlow.deleteNode(conv_output); |
| } else { |
| nn->dataFlow.replaceNode(relu_output, conv_output); |
| output_tensor = repr::nn::get<repr::Tensor>(conv_output); |
| output_node = conv_output; |
| nn->dataFlow.deleteNode(relu_node); |
| nn->dataFlow.deleteNode(relu_output); |
| } |
| |
| // We may have accidentally made the next op in-place |
| // In future iterations of transformations this won't be an issue, |
| // but current caffe2 predictor usage requires things like |
| // external_input and output to be unchanged. |
| bool rectify_inplace = false; |
| for (auto& consumer : repr::nn::getConsumers(output_node)) { |
| for (auto& consumer_output : repr::nn::getOutputs(consumer)) { |
| auto co_name = |
| repr::nn::get<repr::Tensor>(consumer_output)->getName(); |
| if (co_name == output_tensor->getName()) { |
| rectify_inplace = true; |
| } |
| } |
| } |
| if (rectify_inplace) { |
| auto new_output = nn->dataFlow.createNode(make_unique<repr::Tensor>( |
| output_tensor->getName() + "_fusion_fix")); |
| nn->dataFlow.replaceNode(output_node, new_output); |
| } |
| } |
| |
| resetConvForFusion(conv_node, FUSION_CONV_RELU); |
| return true; |
| } |
| return false; |
| } |
| |
| bool enforceFusionInplace(repr::NNModule* nn, caffe2::Workspace* ws) { |
| // For Conv+Sum or Conv+Sum+ReLU fusions, the last input and the output must |
| // share the same buffer (in place). To enforce this, re-check the whole |
| // graph and correct any ConvFusion op whose output tensor differs from its |
| // last input. |
| auto allNodes = nn->dataFlow.getMutableNodes(); |
| // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) |
| for (int i = allNodes.size() - 1; i > 0; i--) { |
| auto convNode = allNodes[i]; |
| if (convNode == nullptr || |
| !repr::nn::is<repr::NeuralNetOperator>(convNode)) { |
| continue; |
| } |
| |
| auto conv = repr::nn::get<repr::NeuralNetOperator>(convNode); |
| if (!isOnIdeepDevice(*conv)) { |
| LOG(WARNING) << "Not a IDEEP operator"; |
| continue; |
| } |
| |
| if (repr::nn::is<repr::Conv>(convNode)) { |
| if (!isConvFusion(convNode, FUSION_CONV_SUM) && |
| !isConvFusion(convNode, FUSION_CONV_SUM_RELU)) |
| continue; |
| } else if ( |
| !isOpType(convNode, "Int8ConvSum") && |
| !isOpType(convNode, "Int8ConvSumRelu")) { |
| continue; |
| } |
| |
| auto convInput = repr::nn::getInputs(convNode).back(); |
| auto inputName = repr::nn::get<repr::Tensor>(convInput)->getName(); |
| auto convOutput = repr::nn::getOutputs(convNode).front(); |
| auto outputName = repr::nn::get<repr::Tensor>(convOutput)->getName(); |
| if (inputName == outputName) { |
| continue; |
| } |
| |
| auto consumer = repr::nn::getConsumers(convInput).back(); |
| if (consumer != convNode) { |
| LOG(ERROR) << "Can not enforce to inplace for fusion"; |
| return false; |
| } |
| |
| auto newOutputTensor = std::make_unique<repr::Tensor>(inputName); |
| auto newOutput = nn->dataFlow.createNode( |
| unique_dyn_cast<repr::NeuralNetData>(newOutputTensor)); |
| nn->dataFlow.replaceNode(convOutput, newOutput); |
| nn->dataFlow.deleteNode(convOutput); |
| |
| return true; |
| } |
| return false; |
| } |
| |
| bool fuseOrderSwitchToQuantizeOp(repr::NNModule* nn, caffe2::Workspace* ws) { |
| // In an INT8 module, each quantize/dequantize op always appears together |
| // with a corresponding order-switch op, which moves data between the INT8 |
| // computation domain and the others. |
| // We assume they always appear in the following combinations and order: |
| // NCHW2NHWC followed by Int8Quantize, or Int8Dequantize followed by NHWC2NCHW. |
| // On IDEEP there is a chance to fuse the order-switch op into the |
| // quantize/dequantize op to improve performance. |
| auto allNodes = nn->dataFlow.getMutableNodes(); |
| // NOLINTNEXTLINE(modernize-loop-convert,clang-diagnostic-sign-compare) |
| for (int i = 0; i < allNodes.size(); ++i) { |
| auto osNode = allNodes[i]; |
| if (osNode == nullptr || !repr::nn::is<repr::NeuralNetOperator>(osNode)) { |
| continue; |
| } |
| |
| if (isOpType(osNode, "NCHW2NHWC")) { |
| auto output = repr::nn::getOutputs(osNode).front(); |
| auto consumers = repr::nn::getConsumers(output); |
| if (consumers.size() != 1) { |
| continue; |
| } |
| |
| auto seqNode = consumers.front(); |
| if (!isOpType(seqNode, "Int8Quantize")) { |
| continue; |
| } |
| |
| auto seq = repr::nn::get<repr::NeuralNetOperator>(seqNode); |
| removeArg(*seq, "output_order"); |
| |
| auto* seqOp = getMutableOpDef(*seq); |
| auto* arg = seqOp->add_arg(); |
| arg->set_name("output_order"); |
| arg->set_i(static_cast<int64_t>(iformat::nhwc)); |
| |
| auto input = repr::nn::getInputs(osNode).front(); |
| nn->dataFlow.replaceNode(output, input); |
| |
| nn->dataFlow.deleteNode(osNode); |
| nn->dataFlow.deleteNode(output); |
| return true; |
| } else if (isOpType(osNode, "NHWC2NCHW")) { |
| auto input = repr::nn::getInputs(osNode).front(); |
| if (input->getInEdges().size() <= 0) { |
| continue; |
| } |
| |
| auto preNode = repr::nn::getProducer(input); |
| if (!isOpType(preNode, "Int8Dequantize")) { |
| continue; |
| } |
| |
| auto pre = repr::nn::get<repr::NeuralNetOperator>(preNode); |
| removeArg(*pre, "output_order"); |
| |
| auto* preOp = getMutableOpDef(*pre); |
| auto* arg = preOp->add_arg(); |
| arg->set_name("output_order"); |
| arg->set_i(static_cast<int64_t>(iformat::nchw)); |
| |
| auto output = repr::nn::getOutputs(osNode).front(); |
| nn->dataFlow.replaceNode(input, output); |
| |
| nn->dataFlow.deleteNode(osNode); |
| nn->dataFlow.deleteNode(input); |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| bool fusePreConvertOp(repr::NNModule* nn, caffe2::Workspace* ws) { |
| // The ops listed below can consume inputs without a preceding format or |
| // data type conversion, so the convert op in front of them can be removed: |
| // 1. Int8Sum currently falls back to FP32, so it can handle inputs with |
| // different formats and data types. |
| // 2. FC is able to convert its input format and data type by itself. |
| // 3. The fallback wrapper handles format and data type conversion for the |
| // other ops. |
| static vector<string> op_list = { |
| "FC", |
| "Python", |
| "Softmax", |
| "Sigmoid", |
| "RoIAlign", |
| "UpsampleNearest", |
| "BatchPermutation", |
| "Int8Sum", |
| "Int8SumRelu", |
| }; |
| |
| auto allNodes = nn->dataFlow.getMutableNodes(); |
| // NOLINTNEXTLINE(modernize-loop-convert,clang-diagnostic-sign-compare) |
| for (int i = 0; i < allNodes.size(); ++i) { |
| auto opNode = allNodes[i]; |
| if (opNode == nullptr || !repr::nn::is<repr::NeuralNetOperator>(opNode)) { |
| continue; |
| } |
| |
| if (!isOpType(opNode, "NCHW2NHWC") && !isOpType(opNode, "NHWC2NCHW") && |
| !isOpType(opNode, "Int8Quantize") && |
| !isOpType(opNode, "Int8Dequantize")) { |
| continue; |
| } |
| |
| auto op = repr::nn::get<repr::NeuralNetOperator>(opNode); |
| if (!isOnIdeepDevice(*op)) { |
| LOG(WARNING) << "Not a IDEEP operator"; |
| continue; |
| } |
| |
| auto output = repr::nn::getOutputs(opNode).front(); |
| auto consumers = repr::nn::getConsumers(output); |
| if (consumers.size() != 1) { |
| continue; |
| } |
| |
| bool is_op_found = false; |
| auto seqNode = consumers.front(); |
| // NOLINTNEXTLINE(modernize-loop-convert,clang-diagnostic-sign-compare) |
| for (int j = 0; j < op_list.size(); j++) { |
| if (isOpType(seqNode, op_list[j])) { |
| is_op_found = true; |
| break; |
| } |
| } |
| if (!is_op_found) { |
| continue; |
| } |
| |
| auto seqOp = repr::nn::get<repr::NeuralNetOperator>(seqNode); |
| if (!isOnIdeepDevice(*seqOp)) { |
| LOG(WARNING) << "Not a IDEEP operator"; |
| continue; |
| } |
| |
| auto input = repr::nn::getInputs(opNode).front(); |
| |
| if (isOpType(opNode, "Int8Dequantize") && |
| repr::nn::hasSingleOutputAndConsumer(opNode)) { |
| auto preNode = repr::nn::getProducer(input); |
| if (isOpType(preNode, "Int8FC") && |
| repr::nn::hasSingleOutputAndConsumer(preNode)) { |
| auto predOp = repr::nn::get<repr::NeuralNetOperator>(preNode); |
| removeArg(*predOp, "Y_scale"); |
| removeArg(*predOp, "Y_zero_point"); |
| } |
| } |
| |
| nn->dataFlow.replaceNode(output, input); |
| |
| nn->dataFlow.deleteNode(opNode); |
| nn->dataFlow.deleteNode(output); |
| return true; |
| } |
| return false; |
| } |
| |
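| // Despite its name, this clears training_mode (sets it to 0) not only on |
| // pooling ops but also on Conv/ConvFusion/FC and their Int8 variants, so |
| // that they all run with inference-time behavior. |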
| void setPoolingInferenceMode(repr::NNModule* nn) { |
| auto setTrainingMode = [](repr::NeuralNetOperator& pool) { |
| if (!isOnIdeepDevice(pool)) { |
| LOG(WARNING) << "Not a IDEEP operator"; |
| return; |
| } |
| auto* op = getMutableOpDef(pool); |
| bool found_training_mode = false; |
| for (auto& arg : *op->mutable_arg()) { |
| if (arg.name() == "training_mode") { |
| arg.set_i(0); |
| found_training_mode = true; |
| break; |
| } |
| } |
| if (!found_training_mode) { |
| auto* arg = op->add_arg(); |
| arg->set_name("training_mode"); |
| arg->set_i(0); |
| } |
| }; |
| |
| auto allNodes = nn->dataFlow.getMutableNodes(); |
| // NOLINTNEXTLINE(modernize-loop-convert,clang-diagnostic-sign-compare) |
| for (int i = 0; i < allNodes.size(); ++i) { |
| auto poolNode = allNodes[i]; |
| if (poolNode == nullptr || |
| !repr::nn::is<repr::NeuralNetOperator>(poolNode)) { |
| continue; |
| } |
| |
| if (isOpType(poolNode, "FC") || isOpType(poolNode, "Conv") || |
| isOpType(poolNode, "ConvFusion") || isOpType(poolNode, "MaxPool") || |
| isOpType(poolNode, "AveragePool") || isOpType(poolNode, "Int8FC") || |
| isOpType(poolNode, "Int8Conv") || isOpType(poolNode, "Int8ConvRelu") || |
| isOpType(poolNode, "Int8ConvSum") || |
| isOpType(poolNode, "Int8ConvSumRelu") || |
| isOpType(poolNode, "Int8MaxPool") || |
| isOpType(poolNode, "Int8AveragePool")) { |
| auto pool = repr::nn::get<repr::NeuralNetOperator>(poolNode); |
| setTrainingMode(*pool); |
| } |
| } |
| } |
| |
| // Pre-convert filters to the expected format here, |
| // to avoid repeated format conversions during computation |
| void preConvertFiltersFormat(repr::NNModule* nn, caffe2::Workspace* ws) { |
| for (auto& node : nn->dataFlow.getMutableNodes()) { |
| if (!repr::nn::is<repr::ConvTranspose>(node) && |
| !repr::nn::is<repr::Conv>(node) && !repr::nn::is<repr::FC>(node)) { |
| continue; |
| } |
| |
| auto* nnOp = repr::nn::get<repr::NeuralNetOperator>(node); |
| if (!isOnIdeepDevice(*nnOp)) { |
| LOG(INFO) << "Not a IDEEP operator"; |
| continue; |
| } |
| |
| auto inputs = repr::nn::getInputs(node); |
| if (inputs.size() < 2) { |
| LOG(WARNING) << "Invalid input size"; |
| continue; |
| } |
| |
| auto* filterBlob = getBlob(inputs[1], ws); |
| auto* filter = getMutableTensor<itensor>(filterBlob); |
| if (filter == nullptr) { |
| continue; |
| } |
| |
| itensor::descriptor expectedDesc; |
| if (repr::nn::is<repr::ConvTranspose>(node)) { |
| if (filter->get_desc().is_iohw()) |
| continue; |
| auto convTranspose = repr::nn::get<repr::ConvTranspose>(node); |
| auto initValue = [](vector<int>& v, vector<int> i) { |
| if (v.empty()) |
| v = i; |
| }; |
| auto strides = convTranspose->getStrides(); |
| initValue(strides, {1, 1}); |
| auto pads = convTranspose->getPads(); |
| initValue(pads, {0, 0, 0, 0}); |
| auto dataType = filter->get_data_type(); |
| ideep::tensor::dims filter_dims_mkldnn{filter->get_dim(1), |
| filter->get_dim(0), |
| filter->get_dim(2), |
| filter->get_dim(3)}; |
| expectedDesc = |
| ideep::convolution_transpose_forward::expected_weights_desc( |
| filter_dims_mkldnn, |
| dataType, |
| {strides.begin(), strides.end()}, |
| {pads[0], pads[1]}, |
| {pads[2], pads[3]}); |
| |
| if (filter->get_descriptor() != expectedDesc) { |
| itensor newFilter; |
| newFilter.init(expectedDesc); |
| newFilter.feed_from(*filter); |
| filterBlob->Reset<itensor>(new itensor(std::move(newFilter))); |
| } |
| } else if (repr::nn::is<repr::Conv>(node)) { |
| auto conv = repr::nn::get<repr::Conv>(node); |
| auto initValue = [](vector<int>& v, vector<int> i) { |
| if (v.empty()) |
| v = i; |
| }; |
| auto strides = conv->getStrides(); |
| initValue(strides, {1, 1}); |
| auto pads = conv->getPads(); |
| initValue(pads, {0, 0, 0, 0}); |
| auto dilations = conv->getDilations(); |
| initValue(dilations, {1, 1}); |
| |
| auto* op = getMutableOpDef(*conv); |
| auto aalgorithm = ialgo::convolution_direct; |
| for (auto& arg : *op->mutable_arg()) { |
| if ((arg.name() == "conv_algorithm") && |
| (arg.i() == CONV_ALGORITHM_WINOGRAD)) { |
| aalgorithm = ialgo::convolution_winograd; |
| } |
| } |
| |
| expectedDesc = ideep::convolution_forward::expected_weights_desc( |
| filter->get_dims(), |
| filter->get_data_type(), |
| {strides.begin(), strides.end()}, |
| {pads[0], pads[1]}, |
| {pads[2], pads[3]}, |
| {dilations.begin(), dilations.end()}, |
| conv->getGroup(), |
| aalgorithm); |
| |
| if (filter->get_descriptor() != expectedDesc) { |
| itensor newFilter; |
| newFilter.init(expectedDesc); |
| newFilter.feed_from(*filter); |
| filterBlob->Reset<itensor>(new itensor(std::move(newFilter))); |
| } |
| // convert weights for FC |
| } else if (repr::nn::is<repr::FC>(node)) { |
| auto fc = repr::nn::get<repr::FC>(node); |
| auto axis_w = fc->getAxisW(); |
| if (axis_w != 1) { |
| auto f_dims = filter->get_dims(); |
| auto f_dim0 = std::accumulate( |
| f_dims.begin(), |
| f_dims.begin() + axis_w, |
| 1, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::multiplies<itensor::dim_t>()); |
| auto f_dim1 = std::accumulate( |
| f_dims.begin() + axis_w, |
| f_dims.end(), |
| 1, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::multiplies<itensor::dim_t>()); |
| filter->reshape({f_dim0, f_dim1}); |
| } |
| |
| expectedDesc = ideep::inner_product_forward::expected_weights_desc( |
| filter->get_dims()); |
| |
| if (filter->get_descriptor() != expectedDesc) { |
| itensor newFilter; |
| newFilter.init(expectedDesc); |
| newFilter.feed_from(*filter); |
| filterBlob->Reset<itensor>(new itensor(std::move(newFilter))); |
| } |
| } |
| } |
| } |
| |
| // Fusion passes for IDEEP: each fuser scans the graph, applies one kind of |
| // operator fusion, and returns true if it modified the graph. |
| using Fuser = bool (*)(repr::NNModule* nn, caffe2::Workspace* ws); |
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) |
| static Fuser fusers[] = { |
| removeStopGradientForInference, |
| fuseConvBNAndAffCh, |
| fuseConvSum, |
| fuseActivation, |
| enforceFusionInplace, |
| fuseOrderSwitchToQuantizeOp, |
| fusePreConvertOp, |
| }; |
| |
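| // OptimizeForMkldnn runs each fuser repeatedly until it makes no further |
| // changes, then forces inference mode on the affected ops; when |
| // training_mode is set it only pre-converts the filter formats. |
| // Typical invocation (illustrative sketch; the converter entry points |
| // convertToNNModule/convertToCaffe2Proto come from caffe2/opt/converter.h, |
| // and the exact calling code may differ): |
| //   caffe2::NetDef net = ...;  // ops with device_option PROTO_IDEEP |
| //   caffe2::Workspace ws;      // holds the pre-loaded weights |
| //   auto nn = caffe2::convertToNNModule(net); |
| //   caffe2::opt::OptimizeForMkldnn(&nn, &ws, /*training_mode=*/false); |
| //   net = caffe2::convertToCaffe2Proto(nn, net); |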
| void OptimizeForMkldnn( |
| repr::NNModule* nn, |
| caffe2::Workspace* ws, |
| bool training_mode) { |
| if (training_mode) { |
| preConvertFiltersFormat(nn, ws); |
| return; |
| } |
| |
| for (auto fuser : fusers) { |
| while (fuser(nn, ws)) { |
| } |
| } |
| |
| setPoolingInferenceMode(nn); |
| } |
| |
| #endif // CAFFE2_USE_MKLDNN |
| |
| } // namespace opt |
| } // namespace caffe2 |