| #include "caffe2/opt/optimize_ideep.h" |
| #include "caffe2/opt/converter.h" |
| #include "caffe2/opt/fusion.h" |
| |
| #ifdef CAFFE2_USE_MKLDNN |
| #include "caffe2/ideep/ideep_utils.h" |
| #endif |
| |
| namespace caffe2 { |
| namespace opt { |
| |
| using namespace nom; |
| |
| #ifndef CAFFE2_USE_MKLDNN |
// Stub entry point used when caffe2 is built without MKLDNN support.
// The IDEEP graph optimizations cannot run in this configuration, so this
// just warns and leaves the net untouched (all parameters are ignored).
void OptimizeForMkldnn(
    repr::NNModule* nn,
    caffe2::Workspace* ws,
    bool training_mode) {
  LOG(WARNING) << "Only support optimizations for IDEEP";
}
| |
| #else |
| USE_IDEEP_DEF_ALIASES(); |
| |
| Blob* getBlob(repr::NNGraph::NodeRef node, caffe2::Workspace* ws) { |
| auto tensor = repr::nn::get<repr::Tensor>(node); |
| CAFFE_ENFORCE(ws->HasBlob(tensor->getName()), "Blob not in workspace"); |
| return ws->GetBlob(tensor->getName()); |
| } |
| |
| template <class T> |
| T* getTensor(Blob* blob) { |
| CAFFE_ENFORCE(blob, "Blob is invalid"); |
| if (blob && blob->template IsType<T>()) { |
| return blob->template GetMutable<T>(); |
| } |
| return nullptr; |
| } |
| |
| template <class T> |
| T* getMutableTensor(Blob* blob) { |
| CAFFE_ENFORCE(blob, "Blob is invalid"); |
| if (blob->template IsType<T>()) { |
| return blob->template GetMutable<T>(); |
| } |
| return nullptr; |
| } |
| |
| const caffe2::OperatorDef& getOpDef(const repr::NeuralNetOperator& nnOp) { |
| auto annotation = nnOp.getAnnotation(); |
| if (annotation == nullptr) { |
| CAFFE_THROW("Cannot get Operator annotation"); |
| } |
| return dyn_cast<Caffe2Annotation>(annotation)->getOperatorDef(); |
| } |
| |
| caffe2::OperatorDef* getMutableOpDef(repr::NeuralNetOperator& nnOp) { |
| auto annotation = nnOp.getMutableAnnotation(); |
| if (annotation == nullptr) { |
| CAFFE_THROW("Cannot get Operator annotation"); |
| } |
| return dyn_cast<Caffe2Annotation>(annotation)->getMutableOperatorDef(); |
| } |
| |
| bool isOpType(const repr::NNGraph::NodeRef& nodeRef, string typeName) { |
| if (!repr::nn::is<repr::NeuralNetOperator>(nodeRef)) { |
| return false; |
| } |
| auto op = repr::nn::get<repr::NeuralNetOperator>(nodeRef); |
| auto opDef = getOpDef(*op); |
| return opDef.type() == typeName; |
| } |
| |
| bool isOnIdeepDevice(const repr::NeuralNetOperator& nnOp) { |
| // We only want to fuse for IDEEP convs |
| const auto& op = getOpDef(nnOp); |
| return op.device_option().device_type() == DeviceTypeProto::PROTO_IDEEP; |
| } |
| |
| bool shouldFuseConv(const repr::Conv& conv) { |
| return isOnIdeepDevice(conv) ? (conv.getGroup() <= 1) : false; |
| } |
| |
| void removeStopGradientForInference(repr::NNModule* nn) { |
| auto isStopGradientNode = [](const repr::NNGraph::NodeRef& node) { |
| if (!repr::nn::is<repr::NeuralNetOperator>(node)) { |
| return false; |
| } |
| auto maybeStopGrad = repr::nn::get<repr::NeuralNetOperator>(node); |
| auto maybeStopGradDef = getOpDef(*maybeStopGrad); |
| return maybeStopGradDef.type() == "StopGradient"; |
| }; |
| |
| auto allNodes = nn->dataFlow.getMutableNodes(); |
| for (int i = 0; i < allNodes.size(); ++i) { |
| auto node = allNodes[i]; |
| if (!isStopGradientNode(node)) { |
| continue; |
| } |
| |
| auto stopGradInput = repr::nn::getInputs(node).front(); |
| auto stopGradOutput = repr::nn::getOutputs(node).front(); |
| auto inputName = repr::nn::get<repr::Tensor>(stopGradInput)->getName(); |
| auto outputName = repr::nn::get<repr::Tensor>(stopGradOutput)->getName(); |
| if (inputName == outputName) { |
| nn->dataFlow.replaceNode(stopGradOutput, stopGradInput); |
| nn->dataFlow.deleteNode(node); |
| } |
| } |
| } |
| |
// Rewrites the Conv at `convNode` into a "ConvFusion" op carrying the given
// fusion_type argument. If the op is already a ConvFusion, the only legal
// nesting is upgrading FUSION_CONV_SUM (2) to FUSION_CONV_SUM_RELU (3) when a
// Relu fusion (fusion_type == 1) is requested on top of it.
void resetConvForFusion(repr::NNGraph::NodeRef convNode, int fusion_type) {
  // Fusion types:
  // FUSION_CONV_RELU = 1
  // FUSION_CONV_SUM = 2
  // FUSION_CONV_SUM_RELU = 3
  auto conv = repr::nn::get<repr::Conv>(convNode);
  auto annotation = conv->getMutableAnnotation();
  if (!annotation || !isa<Caffe2Annotation>(annotation)) {
    // No Caffe2 OperatorDef to rewrite; silently skip this conv.
    return;
  }

  auto* op = getMutableOpDef(*conv);
  if (op == nullptr) {
    return;
  }

  if (op->type() == "ConvFusion") {
    // Nested fusion: only Relu-on-top-of-Sum is allowed.
    CAFFE_ENFORCE(fusion_type == 1, "Invalid nest fusion");
    for (auto& arg : *op->mutable_arg()) {
      if (arg.name() == "fusion_type") {
        // Only from FUSION_CONV_SUM to FUSION_CONV_SUM_RELU
        CAFFE_ENFORCE(arg.i() == 2, "Invalid nest fusion");
        arg.set_i(3);
        return;
      }
    }
    // ConvFusion without a fusion_type arg: leave untouched.
    return;
  }

  // First-time fusion: retype the op and attach the fusion_type argument.
  CAFFE_ENFORCE(fusion_type < 3, "Invalid fusion type");
  op->set_type("ConvFusion");
  auto* arg = op->add_arg();
  arg->set_name("fusion_type");
  arg->set_i(fusion_type);
}
| |
// Performs one Conv + BatchNormalization (or Conv + AffineChannel) folding:
// the BN/AffineChannel scale/bias (and for BN, mean/variance) are baked into
// the conv filter and bias in place, then the BN/AffCh node is spliced out of
// the graph. Returns true when a fusion was performed — the caller re-invokes
// until false, because the graph surgery invalidates the node iteration.
bool fuseConvBNAndAffChHelperForIdeep(repr::NNModule* nn, caffe2::Workspace* ws) {
  auto isAffineChannelNode = [](const repr::NNGraph::NodeRef& node) {
    if (!repr::nn::is<repr::NeuralNetOperator>(node)) {
      return false;
    }
    auto maybeAffCh = repr::nn::get<repr::NeuralNetOperator>(node);
    auto maybeAffChDef = getOpDef(*maybeAffCh);
    return maybeAffChDef.type() == "AffineChannel";
  };

  for (auto node_pair : repr::nn::dataIterator<repr::Conv>(nn->dataFlow)) {
    bool no_bias = false;
    repr::NNGraph::NodeRef convNode;
    repr::Conv* conv;
    std::tie(conv, convNode) = node_pair;

    if (!isOnIdeepDevice(*conv)) {
      LOG(WARNING) << "Not a IDEEP operator";
      continue;
    }

    // Already-fused convs are left alone.
    const auto& op = getOpDef(*conv);
    if (op.type() == "ConvFusion") {
      continue;
    }

    auto convOutput = repr::nn::getOutputs(convNode).front();
    auto consumers = repr::nn::getConsumers(convOutput);
    // convOutput is NOT referenced by sequential ops after BN.
    if (consumers.size() != 1) {
      continue;
    }

    // The single consumer must be BatchNormalization or AffineChannel.
    bool isBN;
    auto consumer = consumers.front();
    if (repr::nn::is<repr::BatchNormalization>(consumer)) {
      isBN = true;
    } else if (isAffineChannelNode(consumer)) {
      isBN = false;
    } else {
      continue;
    }
    auto bnOrAffChNode = consumer;
    auto bn = isBN ? repr::nn::get<repr::BatchNormalization>(bnOrAffChNode) : nullptr;
    auto bnOrAffChOutput = repr::nn::getOutputs(bnOrAffChNode).front();

    auto convInputs = repr::nn::getInputs(convNode);
    if (convInputs.size() < 2) {
      LOG(WARNING) << "Invalid convolution input size";
      continue;
    }

    // BN takes X, scale, bias, mean, var; AffineChannel takes X, scale, bias.
    // NOTE(review): `size()` (unsigned) is compared against int numInputs —
    // harmless here since numInputs is positive, but worth a cast.
    auto bnOrAffChInputs = repr::nn::getInputs(bnOrAffChNode);
    int numInputs = isBN ? 5 : 3;
    if (bnOrAffChInputs.size() < numInputs) {
      LOG(WARNING) << "Invalid input size: "
                   << bnOrAffChInputs.size()
                   << ", expect " << numInputs;
      continue;
    }

    // When no bias, borrow BN bias
    // (wire the BN/AffCh bias tensor in as the conv's third input; its
    // contents are overwritten by the per-channel loop below).
    if (convInputs.size() < 3) {
      no_bias = true;
      nn->dataFlow.createEdge(bnOrAffChInputs[2], convNode);
      convInputs = repr::nn::getInputs(convNode);
    }

// Declares `name` (the live IDEEP tensor at nodes[index]), a public-format
// working copy `name##Tensor`, and a raw float view `name##Data`. When the
// blob is not an IDEEP tensor this `continue`s the ENCLOSING conv loop.
// No comments may be added inside the macro body itself: a `//` would
// swallow the backslash line continuations (splicing precedes comment
// stripping).
#define EXPOSE_TENSOR_DATA(name, index, nodes, need_init)                  \
  itensor* name = nullptr;                                                 \
  itensor name##Tensor;                                                    \
  float* name##Data = nullptr;                                             \
  if (need_init) {                                                         \
    name = getTensor<itensor>(getBlob(nodes[index], ws));                  \
    if (name == nullptr) {                                                 \
      LOG(WARNING) << #name " not a IDEEP tensor";                         \
      continue;                                                            \
    }                                                                      \
    name##Tensor.resize(name->get_dims(), name->get_data_type());          \
    name##Tensor.reorder_from(*name);                                      \
    CAFFE_ENFORCE(                                                         \
        name##Tensor.is_public_format(), #name " not with public format"); \
    name##Data = static_cast<float*>(name##Tensor.get_data_handle());      \
  }

    EXPOSE_TENSOR_DATA(filter, 1, convInputs, true);
    EXPOSE_TENSOR_DATA(biasConv, 2, convInputs, true);

    EXPOSE_TENSOR_DATA(scale, 1, bnOrAffChInputs, true);
    EXPOSE_TENSOR_DATA(biasBNOrAffCh, 2, bnOrAffChInputs, true);
    // mean/variance exist only for BN; for AffineChannel the defaults below
    // (mean 0, variance 1) make the arithmetic degenerate correctly.
    EXPOSE_TENSOR_DATA(mean, 3, bnOrAffChInputs, isBN);
    EXPOSE_TENSOR_DATA(variance, 4, bnOrAffChInputs, isBN);

#undef EXPOSE_TENSOR_DATA

    // Assume M{CHW,HWC}
    auto chwDim = filterTensor.get_dim(1) * filterTensor.get_dim(2) *
        filterTensor.get_dim(3);
    for (auto c = 0; c < filterTensor.get_dim(0); ++c) {
      float mean_val = 0;
      float variance_val = 1;
      if (isBN) {
        mean_val = meanData[c];
        // variance_val actually holds sqrt(var + eps), i.e. the std-dev.
        variance_val = std::sqrt(varianceData[c] + bn->getEpsilon());
      }
      // Per output channel c: W'[c] = W[c] * coeff,
      // b'[c] = bias_bn[c] + (b[c] - mean) * coeff, with coeff = scale/std.
      float coeff = scaleData[c] / variance_val;
      for (auto i = 0; i < chwDim; ++i) {
        filterData[c * chwDim + i] *= coeff;
      }

      if (no_bias) {
        // Borrowed-bias case: the conv had no bias term of its own.
        biasConvData[c] = biasBNOrAffChData[c] - mean_val * coeff;
      } else {
        biasConvData[c] =
            biasBNOrAffChData[c] + (biasConvData[c] - mean_val) * coeff;
      }
    }

    // Write the rescaled values back into the live IDEEP tensors, then
    // splice the BN/AffCh node out: the conv now produces bnOrAffChOutput.
    filter->reorder_from(filterTensor);
    biasConv->reorder_from(biasConvTensor);
    nn->dataFlow.replaceNode(convOutput, bnOrAffChOutput);

    nn->dataFlow.deleteNode(bnOrAffChNode);
    nn->dataFlow.deleteNode(convOutput);

    // Graph changed; restart the scan from scratch in the caller.
    return true;
  }

  return false;
}
| |
| void fuseConvBNAndAffChForIdeep(repr::NNModule* nn, caffe2::Workspace* ws) { |
| while (fuseConvBNAndAffChHelperForIdeep(nn, ws)) { |
| } |
| } |
| |
| void fuseConvSumForIdeep(repr::NNModule* nn, caffe2::Workspace* ws) { |
| // Assume the order of nodes from getMutableNodes conforms to |
| // the original topo order of operators |
| auto allNodes = nn->dataFlow.getMutableNodes(); |
| for (int i = 0; i < allNodes.size(); i++) { |
| auto sumNode = allNodes[i]; |
| if (!repr::nn::hasInputs(sumNode)) { |
| continue; |
| } |
| |
| // CAUTION: On IDEEP device, only element-wise Add operator is |
| // supported yet. It totally works as element-wise sum without scalar broadcast. |
| if (!repr::nn::is<repr::Sum>(sumNode) && !isOpType(sumNode, "Add")) { |
| continue; |
| } |
| |
| auto sum = repr::nn::get<repr::NeuralNetOperator>(sumNode); |
| if (!isOnIdeepDevice(*sum)) { |
| LOG(WARNING) << "Not a IDEEP operator"; |
| continue; |
| } |
| |
| auto sumInputs = repr::nn::getInputs(sumNode); |
| if (sumInputs.size() != 2) { |
| continue; |
| } |
| |
| bool should_fuse = true; |
| for (auto input : sumInputs) { |
| auto consumer = repr::nn::getConsumers(input).back(); |
| if (consumer != sumNode) { |
| should_fuse = false; |
| break; |
| } |
| } |
| // Sum inputs should not be referenced by sequential ops. |
| if (!should_fuse) { |
| continue; |
| } |
| |
| int j = i - 1; |
| repr::NNGraph::NodeRef convNode = nullptr; |
| while (j-- >= 0) { |
| if (!repr::nn::hasInputs(sumNode)) { |
| continue; |
| } |
| |
| // Find the nearest Op before Sum |
| if (repr::nn::is<repr::NeuralNetOperator>(allNodes[j])) { |
| // The Op must be a Conv |
| if (repr::nn::is<repr::Conv>(allNodes[j])) { |
| convNode = allNodes[j]; |
| } |
| break; |
| } |
| } |
| if (convNode == nullptr) { |
| continue; |
| } |
| |
| auto conv = repr::nn::get<repr::Conv>(convNode); |
| if (!shouldFuseConv(*conv)) { |
| LOG(WARNING) << "Not a IDEEP operator"; |
| continue; |
| } |
| |
| auto convOutput = repr::nn::getOutputs(convNode).front(); |
| repr::NNGraph::NodeRef sumInputX = |
| (sumInputs[0] == convOutput ? sumInputs[1] : sumInputs[0]); |
| CAFFE_ENFORCE(sumInputX != nullptr, "Invalid sum inputs"); |
| |
| auto preNode = repr::nn::getProducer(sumInputX); |
| if (preNode == nullptr || !repr::nn::is<repr::NeuralNetOperator>(preNode)) { |
| LOG(WARNING) << "Can not fuse Conv Sum"; |
| continue; |
| } |
| |
| auto newOutputName = repr::nn::get<repr::Tensor>(sumInputX)->getName(); |
| auto newOutputTensor = util::make_unique<repr::Tensor>(newOutputName); |
| auto newOutput = nn->dataFlow.createNode( |
| unique_dyn_cast<repr::NeuralNetData>(newOutputTensor)); |
| |
| auto sumOutput = repr::nn::getOutputs(sumNode).front(); |
| nn->dataFlow.replaceNode(sumOutput, newOutput); |
| |
| // 2 means FUSION_CONV_SUM |
| resetConvForFusion(convNode, 2); |
| nn->dataFlow.createEdge(sumInputX, convNode); |
| nn->dataFlow.createEdge(convNode, newOutput); |
| |
| nn->dataFlow.deleteNode(sumNode); |
| nn->dataFlow.deleteNode(sumOutput); |
| nn->dataFlow.deleteNode(convOutput); |
| } |
| } |
| |
| void fuseActivationForIdeep(repr::NNModule* nn) { |
| // Conv+Relu fusion |
| auto should_fuse = shouldFuseConv; |
| auto postprocess = std::bind(resetConvForFusion, std::placeholders::_1, 1); |
| fuseActivation<repr::Conv, repr::Relu>(nn, should_fuse, postprocess); |
| } |
| |
void enforceFusionInplaceForIdeep(repr::NNModule* nn) {
  // For fusions of Conv+Sum or Conv+Sum+ReLU, the last input and output must
  // be inplaced. To enforce inplace, here to re-check whole graph and correct
  // the ConvFusion Ops.
  for (auto node_pair : repr::nn::dataIterator<repr::Conv>(nn->dataFlow)) {
    repr::NNGraph::NodeRef convNode;
    repr::Conv* conv;
    std::tie(conv, convNode) = node_pair;

    if (!isOnIdeepDevice(*conv)) {
      LOG(WARNING) << "Not a IDEEP operator";
      continue;
    }

    const auto& op = getOpDef(*conv);
    if (op.type() != "ConvFusion") {
      continue;
    }

    // Inplace is only required for FUSION_CONV_SUM (2) and
    // FUSION_CONV_SUM_RELU (3); plain Conv+Relu fusions are left alone.
    bool enforce_inplace = false;
    for (const auto& arg : op.arg()) {
      // Only check FUSION_SUM & FUSION_SUM_RELU
      if (arg.name() == "fusion_type" && (arg.i() == 2 || arg.i() == 3)) {
        enforce_inplace = true;
        break;
      }
    }

    if (!enforce_inplace) {
      continue;
    }

    // The Sum operand is the conv's LAST input (appended by
    // fuseConvSumForIdeep); compare its name against the conv output's name.
    auto convInput = repr::nn::getInputs(convNode).back();
    auto inputName = repr::nn::get<repr::Tensor>(convInput)->getName();
    auto convOutput = repr::nn::getOutputs(convNode).front();
    auto outputName = repr::nn::get<repr::Tensor>(convOutput)->getName();
    if (inputName == outputName) {
      // Already inplace; nothing to fix.
      continue;
    }

    // The operand must not feed any op other than this conv, otherwise
    // renaming the output in place would corrupt that consumer's input.
    auto consumer = repr::nn::getConsumers(convInput).back();
    if (consumer != convNode) {
      LOG(ERROR) << "Can not enforce to inplace for fusion";
      return;
    }

    // Re-point the conv's output at a fresh tensor node carrying the input's
    // name, making the fused op write in place.
    auto newOutputTensor = util::make_unique<repr::Tensor>(inputName);
    auto newOutput = nn->dataFlow.createNode(
        unique_dyn_cast<repr::NeuralNetData>(newOutputTensor));
    nn->dataFlow.replaceNode(convOutput, newOutput);

    nn->dataFlow.deleteNode(convOutput);
  }
}
| |
| void setPoolingInferenceMode(repr::NNModule* nn) { |
| for (auto node_pair : repr::nn::dataIterator<repr::MaxPool>(nn->dataFlow)) { |
| repr::NNGraph::NodeRef maxPoolNode; |
| repr::MaxPool* maxPool; |
| std::tie(maxPool, maxPoolNode) = node_pair; |
| |
| if (!isOnIdeepDevice(*maxPool)) { |
| LOG(WARNING) << "Not a IDEEP operator"; |
| continue; |
| } |
| |
| auto* op = getMutableOpDef(*maxPool); |
| bool found_training_mode = false; |
| for (auto& arg : *op->mutable_arg()) { |
| if (arg.name() == "training_mode") { |
| arg.set_i(0); |
| found_training_mode = true; |
| break; |
| } |
| } |
| |
| if (!found_training_mode) { |
| auto* arg = op->add_arg(); |
| arg->set_name("training_mode"); |
| arg->set_i(0); |
| } |
| } |
| } |
| |
// Pre-convert filters format to expected one here
// in order to avoid boring conversions during computations
void preConvertFiltersFormat(repr::NNModule* nn, caffe2::Workspace* ws) {
  for (auto& node : nn->dataFlow.getMutableNodes()) {
    // Only ConvTranspose, Conv and FC own weight tensors worth reordering.
    if (!repr::nn::is<repr::ConvTranspose>(node) &&
        !repr::nn::is<repr::Conv>(node) && !repr::nn::is<repr::FC>(node)) {
      continue;
    }

    auto* nnOp = repr::nn::get<repr::NeuralNetOperator>(node);
    if (!isOnIdeepDevice(*nnOp)) {
      LOG(INFO) << "Not a IDEEP operator";
      continue;
    }

    auto inputs = repr::nn::getInputs(node);
    if (inputs.size() < 2) {
      LOG(WARNING) << "Invalid input size";
      continue;
    }

    // inputs[1] is the weight/filter tensor for all three op types.
    auto* filterBlob = getBlob(inputs[1], ws);
    auto* filter = getMutableTensor<itensor>(filterBlob);
    if (filter == nullptr) {
      // Not an IDEEP tensor (yet); nothing to pre-convert.
      continue;
    }

    itensor::descriptor expectedDesc;
    if (repr::nn::is<repr::ConvTranspose>(node)) {
      // iohw public format is the marker set below — already converted.
      if (filter->get_public_format() == ideep::format::iohw)
        continue;
      auto convTranspose = repr::nn::get<repr::ConvTranspose>(node);
      // Substitute defaults when the op omits strides/pads.
      auto initValue = [](vector<int>& v, vector<int> i) {
        if (v.empty())
          v = i;
      };
      auto strides = convTranspose->getStrides();
      initValue(strides, {1, 1});
      auto pads = convTranspose->getPads();
      initValue(pads, {0, 0, 0, 0});
      // NOTE(review): `op` and `aalgorithm` are unused in this branch.
      // getMutableOpDef still throws on a missing annotation, so removing the
      // call would change behavior — confirm whether that side effect is
      // intended before cleaning up.
      auto* op = getMutableOpDef(*convTranspose);
      auto aalgorithm = ialgo::deconvolution_direct;
      auto dataType = filter->get_data_type();
      // Deconv weights swap the first two dims (oihw -> iohw) for MKL-DNN.
      ideep::tensor::dims filter_dims_mkldnn{filter->get_dim(1),
                                             filter->get_dim(0),
                                             filter->get_dim(2),
                                             filter->get_dim(3)};
      expectedDesc =
          ideep::convolution_transpose_forward::expected_weights_descriptor(
              filter_dims_mkldnn,
              dataType,
              strides,
              {pads[0], pads[1]},
              {pads[2], pads[3]});

      // Reorder only when the current layout differs from the expected one.
      if (filter->get_descriptor() != expectedDesc) {
        filter->set_public_format(ideep::format::iohw);
        // NOTE(review): rvalue-reference local bound to a temporary
        // (lifetime-extended); a plain `itensor newFilter(expectedDesc);`
        // would be clearer — confirm no ideep-specific reason for this form.
        itensor&& newFilter(expectedDesc);
        ideep::reorder::compute(*filter, newFilter);
        newFilter.set_public_format(ideep::format::iohw);
        filterBlob->Reset<itensor>(new itensor(newFilter));
      }
    } else if (repr::nn::is<repr::Conv>(node)) {
      auto conv = repr::nn::get<repr::Conv>(node);
      // Substitute defaults when the op omits strides/pads/dilations.
      auto initValue = [](vector<int>& v, vector<int> i) {
        if (v.empty())
          v = i;
      };
      auto strides = conv->getStrides();
      initValue(strides, {1, 1});
      auto pads = conv->getPads();
      initValue(pads, {0, 0, 0, 0});
      auto dilations = conv->getDilations();
      initValue(dilations, {1, 1});

      // Honor an explicit Winograd algorithm request from the op def.
      auto* op = getMutableOpDef(*conv);
      auto aalgorithm = ialgo::convolution_direct;
      for (auto& arg : *op->mutable_arg()) {
        if ((arg.name() == "conv_algorithm") &&
            (arg.i() == CONV_ALGORITHM_WINOGRAD)) {
          aalgorithm = ialgo::convolution_winograd;
        }
      }
      auto dataType = filter->get_data_type();

      // Grouped convs need the grouped (5-d) weight layout before querying.
      filter->make_group(conv->getGroup());
      expectedDesc = ideep::convolution_forward::expected_weights_descriptor(
          filter->get_dims(),
          dataType,
          strides,
          {pads[0], pads[1]},
          {pads[2], pads[3]},
          dilations,
          conv->getGroup(),
          aalgorithm);

      if (filter->get_descriptor() != expectedDesc) {
        itensor&& newFilter(expectedDesc);
        ideep::reorder::compute(*filter, newFilter);
        filterBlob->Reset<itensor>(new itensor(newFilter));
      }
      // convert weights for FC
    } else if (repr::nn::is<repr::FC>(node)) {
      auto fc = repr::nn::get<repr::FC>(node);
      auto axis_w = fc->getAxisW();
      if (axis_w != 1) {
        // Collapse the weight tensor to 2-D around axis_w, mirroring FC's
        // runtime flattening: dims [0, axis_w) x dims [axis_w, end).
        auto f_dims = filter->get_dims();
        auto f_dim0 = std::accumulate(
            f_dims.begin(),
            f_dims.begin() + axis_w,
            1,
            std::multiplies<itensor::dim_t>());
        auto f_dim1 = std::accumulate(
            f_dims.begin() + axis_w,
            f_dims.end(),
            1,
            std::multiplies<itensor::dim_t>());
        filter->reshape({f_dim0, f_dim1});
      }

      expectedDesc = ideep::inner_product_forward::expected_weights_descriptor(
          filter->get_dims());

      if (filter->get_descriptor() != expectedDesc) {
        itensor&& newFilter(expectedDesc);
        ideep::reorder::compute(filter->as_weights(), newFilter);
        filterBlob->Reset<itensor>(new itensor(newFilter));
      }
    }
  }
}
| |
// Entry point for the IDEEP/MKL-DNN graph optimizations.
// In training mode only the weight-layout pre-conversion runs; every fusion
// pass below assumes inference semantics (e.g. folding BN statistics into
// conv weights). Pass order matters: Conv+Sum fusion runs before Conv+Relu
// fusion so a trailing Relu can upgrade FUSION_CONV_SUM (2) to
// FUSION_CONV_SUM_RELU (3) inside resetConvForFusion, and inplace
// enforcement runs after both to fix up the fused ops.
void OptimizeForMkldnn(repr::NNModule *nn, caffe2::Workspace *ws,
                       bool training_mode) {
  if (training_mode) {
    preConvertFiltersFormat(nn, ws);
    return;
  }

  // Drop inference-time no-op StopGradient ops first.
  removeStopGradientForInference(nn);

  // Fold BN/AffineChannel into preceding convs so later passes see plain
  // Conv ops rather than Conv+BN pairs.
  fuseConvBNAndAffChForIdeep(nn, ws);

  fuseConvSumForIdeep(nn, ws);

  fuseActivationForIdeep(nn);

  enforceFusionInplaceForIdeep(nn);

  setPoolingInferenceMode(nn);
}
| |
| #endif // CAFFE2_USE_MKLDNN |
| |
| } // namespace opt |
| } // namespace caffe2 |