caffe2/opt/glow_net_transform.cc - platform/external/pytorch - Git at Google

 #include "glow_net_transform.h"

 #include <caffe2/opt/onnxifi_transformer.h>
 #include <caffe2/opt/shape_info.h>
 #include <caffe2/utils/string_utils.h>

 #include <exception>
 #include <unordered_set>

 // Command-line flags consulted by caffe2::glow::onnxifi() in this file. Each
 // flag is read there through its FLAGS_<name> variable; the help string of
 // every definition describes what the flag controls.
 C10_DEFINE_bool(onnxifi_debug_mode, false, "Enable onnxifi debug mode.");

 C10_DEFINE_bool(
     onnxifi_adjust_batch,
     true,
     "Attach AdjustBatch ops at input/outputs of the Onnxifi ops");

 C10_DEFINE_bool(
     enforce_fp32_inputs_into_fp16,
     false,
     "Whether to enforce fp32 to fp16 conversion for external inputs.");

 C10_DEFINE_bool(
     merge_fp32_inputs_into_fp16,
     false,
     "Merge all the fp32 input tensors into one, convert it to fp16 and split it back");

 // Default used when the caller of onnxifi() does not pass an explicit
 // verify_only_single_subnet value.
 C10_DEFINE_bool(
     verify_only_single_subnet,
     false,
     "Check that only one subnet is created during Onnxifi.");

 // Default used when the caller of onnxifi() does not pass an explicit min_ops
 // value.
 C10_DEFINE_int32(
     onnxifi_min_ops,
     1,
     "Minimum number of ops for a subgraph to be lowered to backend");

 C10_DEFINE_int32(
     onnxifi_timeout_ms,
     0,
     "Timeout limit for onnxifi inference in milliseconds. 0 means no timeout");

 C10_DEFINE_string(
     onnxifi_shape_hints,
     "",
     "Shape hints in the form of Name:d0,d1:d2;");

 C10_DEFINE_string(
     onnxifi_blacklist,
     "",
     "A list of net positions whose corresponding op will be ignored "
     "to onnxifi. Example 0-50,61,62-70");

 // Default used when the caller of onnxifi() does not pass an explicit
 // blacklist_ops value.
 C10_DEFINE_string(
     onnxifi_blacklist_ops,
     "",
     "A list of operator types that will be ignored "
     "to onnxifi. Example Tanh,Mul");

 C10_DEFINE_string(
     onnxifi_input_output_observe_list,
     "",
     "A list of net positions whose corresponding op's inputs and outputs will be"
     " observed. ");

 C10_DEFINE_bool(
     use_onnxifi_batch_size,
     true,
     "If true then instead of nominal batch blob for determining current batch "
     "size we would use batch size provided as part of Glow request data.");

 namespace caffe2 {
 namespace glow {

 // Parses a comma-separated list of net positions such as "0-3,5,6-7" into
 // the set of positions it denotes ([0,1,2,3,5,6,7] for that example).
 //
 // Every token is one of:
 //   * the literal "-1", stored as -1 (special-cased because a leading '-'
 //     would otherwise be taken for the range separator);
 //   * a single integer;
 //   * an inclusive range "from-to" (a range with from > to adds nothing).
 //
 // A token that cannot be understood (more than two '-' separated parts,
 // non-numeric text, or a number that does not fit in an int) is reported
 // with a warning and skipped, so one bad entry no longer escapes as a bare
 // std::stoi exception. A token that splits into no parts at all is skipped
 // without a warning, exactly as before.
 std::unordered_set<int> ParseNetPositionList(const std::string& str) {
   std::unordered_set<int> net_position_list;
   if (str.empty()) {
     return net_position_list;
   }

   // std::stoi signals bad input by throwing std::invalid_argument or
   // std::out_of_range. Convert that into a boolean so malformed numbers take
   // the same warn-and-skip path as malformed ranges. Only the conversion is
   // inside the try block, so unrelated failures are not swallowed.
   const auto parse_position = [](const std::string& text, int* value) {
     try {
       *value = std::stoi(text);
       return true;
     } catch (const std::exception&) {
       return false;
     }
   };

   const auto tokens = caffe2::split(',', str);
   for (const auto& token : tokens) {
     if (token == "-1") {
       net_position_list.emplace(-1);
       continue;
     }
     const auto range = caffe2::split('-', token);
     int from = 0;
     int to = 0;
     if (range.size() == 1 && parse_position(range[0], &from)) {
       net_position_list.emplace(from);
     } else if (
         range.size() == 2 && parse_position(range[0], &from) &&
         parse_position(range[1], &to)) {
       for (int i = from; i <= to; ++i) {
         net_position_list.emplace(i);
       }
     } else if (!range.empty()) {
       LOG(WARNING) << "Ignoring illegal range: " << token;
     }
   }
   return net_position_list;
 }

 // Turns a comma-separated list of operator types (for example "Tanh,Mul")
 // into a set holding each listed type once. An empty string yields an empty
 // set without invoking the splitter.
 std::unordered_set<std::string> ParseBlockListOps(const std::string& str) {
   if (str.empty()) {
     return std::unordered_set<std::string>();
   }
   const auto op_types = caffe2::split(',', str);
   return std::unordered_set<std::string>(op_types.begin(), op_types.end());
 }

 // Carrying out the ONNXIFI transform
 //
 // Prepares `net` and runs OnnxifiTransformer::transform on it. In order:
 //   1. splitSparseLengthsSumSparse() is applied to the net.
 //   2. The net's external inputs are reset to `input_names` followed by
 //      `weight_names`, and its external outputs to `output_names`.
 //   3. OnnxifiTransformerOptions are filled from the arguments and the
 //      FLAGS_* values defined at the top of this file.
 //   4. The blocklist of net positions is assembled from --onnxifi_blacklist,
 //      `blocklist`, blocklisted op types and `blocklist_blobs`.
 //   5. Blocklisted Copy ops are appended for every tensor that should be
 //      observed (see the comment further down).
 //   6. The transformer runs, and afterwards the blobs named in `input_names`
 //      are removed from `ws`.
 //
 // Parameters:
 //   net                  Net to transform; modified in place.
 //   ws                   Workspace handed to the helpers and the transformer.
 //   input_names          Names registered as external inputs; also the blobs
 //                        removed from `ws` at the end.
 //   output_names         Names registered as external outputs.
 //   weight_names         Names registered as external inputs after
 //                        `input_names`; also passed to the transformer.
 //   blocklist            Net positions added to the blocklist.
 //   shape_hints_max_bs   Starting shape hints; --onnxifi_shape_hints, when
 //                        set, is parsed into a copy of this map.
 //   use_onnx             Forwarded to the options. When true, op-type based
 //                        blocklisting is skipped.
 //   max_batch_size       Forwarded to bound_shape_spec.max_batch_size.
 //   max_seq_size         Forwarded to bound_shape_spec.max_seq_size.
 //   load_model_by_blob, predictor_net_ssa_rewritten, shape_hints_per_bs
 //                        Forwarded to the options unchanged.
 //   blacklist_ops        Comma-separated op types to blocklist; falls back to
 //                        --onnxifi_blacklist_ops when not set.
 //   min_ops              Falls back to --onnxifi_min_ops when not set.
 //   blocklist_blobs      Ops whose first input is in this set are blocklisted.
 //   verify_only_single_subnet
 //                        Falls back to --verify_only_single_subnet when not
 //                        set.
 //
 // Raises (via CAFFE_THROW) when --onnxifi_input_output_observe_list names a
 // position that is negative or not smaller than the number of ops.
 void onnxifi(
     NetDef* net,
     Workspace* ws,
     const std::vector<std::string>& input_names,
     const std::vector<std::string>& output_names,
     const std::vector<std::string>& weight_names,
     const std::unordered_set<int>& blocklist,
     const ShapeInfoMap& shape_hints_max_bs,
     bool use_onnx,
     size_t max_batch_size,
     size_t max_seq_size,
     bool load_model_by_blob,
     bool predictor_net_ssa_rewritten,
     const std::unordered_map<int, ShapeInfoMap> &shape_hints_per_bs,
     const c10::optional<std::string> &blacklist_ops,
     const c10::optional<size_t> &min_ops,
     const std::unordered_set<std::string> &blocklist_blobs,
     const c10::optional<bool> &verify_only_single_subnet) {
   // Split SparseLengthsSumSparse so that we can lower the SparseLengthsSum part
   splitSparseLengthsSumSparse(net, *ws);

   // Clean up the external input/output of the net
   net->mutable_external_input()->Clear();
   net->mutable_external_output()->Clear();
   for (const auto& i : input_names) {
     net->add_external_input(i);
   }
   for (const auto& w : weight_names) {
     net->add_external_input(w);
   }
   for (const auto& o : output_names) {
     net->add_external_output(o);
   }

   // ONNXIFI transform
   OnnxifiTransformerOptions opts;
   opts.use_onnx = use_onnx;
   opts.bound_shape_spec.max_batch_size = max_batch_size;
   opts.bound_shape_spec.max_seq_size = max_seq_size;
   opts.debug = FLAGS_onnxifi_debug_mode;
   opts.adjust_batch = FLAGS_onnxifi_adjust_batch;
   opts.min_ops = min_ops.value_or(FLAGS_onnxifi_min_ops);
   opts.load_model_by_blob = load_model_by_blob;
   opts.enforce_fp32_inputs_into_fp16 = FLAGS_enforce_fp32_inputs_into_fp16;
   opts.merge_fp32_inputs_into_fp16 = FLAGS_merge_fp32_inputs_into_fp16;
   opts.verify_only_single_subnet =
       verify_only_single_subnet.value_or(FLAGS_verify_only_single_subnet);
   opts.predictor_net_ssa_rewritten = predictor_net_ssa_rewritten;
   opts.timeout = FLAGS_onnxifi_timeout_ms;
   opts.shape_hints_per_bs = shape_hints_per_bs;
   opts.use_onnxifi_batch_size = FLAGS_use_onnxifi_batch_size;

   // Work on a copy so the caller's hints are left untouched when the flag
   // contributes additional entries.
   ShapeInfoMap more_shape_hints = shape_hints_max_bs;
   if (!FLAGS_onnxifi_shape_hints.empty()) {
     parseShapeInfoMapFromString(FLAGS_onnxifi_shape_hints, more_shape_hints);
   }

   // Before applying the blocklist, make sure the ops in the net all have a
   // net_pos.
   caffe2::BackendTransformerBase::annotateOpIndex(net);

   // Parse the blocklist
   auto more_blocklist = ParseNetPositionList(FLAGS_onnxifi_blacklist);
   for (const auto& b : blocklist) {
     more_blocklist.emplace(b);
   }

   // ONNX mode will change the op order so it doesn't apply here
   if (!opts.use_onnx) {
     auto blocklisted_ops = ParseBlockListOps(
         blacklist_ops.value_or(FLAGS_onnxifi_blacklist_ops));
     for (const auto& op : net->op()) {
       if (blocklisted_ops.count(op.type())) {
         ArgumentHelper helper(op);
         more_blocklist.emplace(helper.GetSingleArgument(op, kNetPos, -1));
       }
     }
   }
   // exclude blocklisted blobs, which is supposed to be loaded to NVM selectively.
   for (const auto& op : net->op()) {
     // Ops without any input have no first input to compare against; reading
     // input(0) on them would index past the end of the repeated field.
     if (op.input_size() > 0 && blocklist_blobs.count(op.input(0))) {
       ArgumentHelper helper(op);
       more_blocklist.emplace(helper.GetSingleArgument(op, kNetPos, -1));
     }
   }

   // Attach observation nodes
   //
   // When we want to observe intermediate tensors value out of the onnxifi op,
   // we use the following trick:
   //
   // 1. for specified op, we find its input and outputs.
   // 2. for each input and output, we create a new copy op and attach it as an
   // input to the copy.
   // 3. we blocklist these new copy operators from onnxification. This forces
   // these intermediate tensors to also become outputs of the onnxifi op.
   // 4. we put the right arguments on the copy ops so TensorObserver can print
   // out the values.
   auto ops_to_observe =
       ParseNetPositionList(FLAGS_onnxifi_input_output_observe_list);
   std::unordered_set<std::string> tensors_to_observe;
   for (const auto& observe_pos : ops_to_observe) {
     // The parsed list may contain negative positions (e.g. "-1"), so both
     // ends of the valid range have to be checked before indexing the net.
     if (observe_pos < 0 || observe_pos >= net->op().size()) {
       CAFFE_THROW(
           "Cannot observe operator at position ",
           observe_pos,
           " (out of range)");
     }
     const auto& op_to_observe = net->op(observe_pos);
     tensors_to_observe.insert(
         op_to_observe.input().begin(), op_to_observe.input().end());

     // For Concat/Reshape with two outputs only the first output is observed.
     if ((op_to_observe.type() == "Concat" ||
          op_to_observe.type() == "Reshape") &&
         op_to_observe.output().size() == 2) {
       tensors_to_observe.insert(op_to_observe.output(0));
     } else {
       tensors_to_observe.insert(
           op_to_observe.output().begin(), op_to_observe.output().end());
     }
   }
   for (const auto& tensor : tensors_to_observe) {
     OperatorDef copy_op;
     copy_op.set_type("Copy");
     copy_op.add_input(tensor);
     copy_op.add_output(tensor + "_copy_output_ignore");
     // The new op is appended at the end, so its net position is the current
     // op count.
     auto pos = net->op().size();
     AddArgument(kNetPos, pos, &copy_op);
     AddArgument("observe_input_tensors", 1, &copy_op);
     net->add_op()->CopyFrom(copy_op);
     more_blocklist.emplace(pos);
   }

   OnnxifiTransformer ts(opts);
   ts.transform(ws, net, weight_names, more_shape_hints, more_blocklist);

   // Cleanup the input from the workspace
   for (const auto& i : input_names) {
     ws->RemoveBlob(i);
   }
 }

 } // namespace glow
 } // namespace caffe2
	#include "glow_net_transform.h"

	#include <caffe2/opt/onnxifi_transformer.h>
	#include <caffe2/opt/shape_info.h>
	#include <caffe2/utils/string_utils.h>

	#include <unordered_set>

	// Command-line flags consulted by caffe2::glow::onnxifi() in this file. Each
	// flag is read there through its FLAGS_<name> variable; the help string of
	// every definition describes what the flag controls.
	C10_DEFINE_bool(onnxifi_debug_mode, false, "Enable onnxifi debug mode.");

	C10_DEFINE_bool(
	    onnxifi_adjust_batch,
	    true,
	    "Attach AdjustBatch ops at input/outputs of the Onnxifi ops");

	C10_DEFINE_bool(
	    enforce_fp32_inputs_into_fp16,
	    false,
	    "Whether to enforce fp32 to fp16 conversion for external inputs.");

	C10_DEFINE_bool(
	    merge_fp32_inputs_into_fp16,
	    false,
	    "Merge all the fp32 input tensors into one, convert it to fp16 and split it back");

	// Default used when the caller of onnxifi() does not pass an explicit
	// verify_only_single_subnet value.
	C10_DEFINE_bool(
	    verify_only_single_subnet,
	    false,
	    "Check that only one subnet is created during Onnxifi.");

	// Default used when the caller of onnxifi() does not pass an explicit min_ops
	// value.
	C10_DEFINE_int32(
	    onnxifi_min_ops,
	    1,
	    "Minimum number of ops for a subgraph to be lowered to backend");

	C10_DEFINE_int32(
	    onnxifi_timeout_ms,
	    0,
	    "Timeout limit for onnxifi inference in milliseconds. 0 means no timeout");

	C10_DEFINE_string(
	    onnxifi_shape_hints,
	    "",
	    "Shape hints in the form of Name:d0,d1:d2;");

	C10_DEFINE_string(
	    onnxifi_blacklist,
	    "",
	    "A list of net positions whose corresponding op will be ignored "
	    "to onnxifi. Example 0-50,61,62-70");

	// Default used when the caller of onnxifi() does not pass an explicit
	// blacklist_ops value.
	C10_DEFINE_string(
	    onnxifi_blacklist_ops,
	    "",
	    "A list of operator types that will be ignored "
	    "to onnxifi. Example Tanh,Mul");

	C10_DEFINE_string(
	    onnxifi_input_output_observe_list,
	    "",
	    "A list of net positions whose corresponding op's inputs and outputs will be"
	    " observed. ");

	C10_DEFINE_bool(
	    use_onnxifi_batch_size,
	    true,
	    "If true then instead of nominal batch blob for determining current batch "
	    "size we would use batch size provided as part of Glow request data.");

	namespace caffe2 {
	namespace glow {

	// Parses a comma-separated list of net positions such as "0-3,5,6-7" into
	// the set of positions it denotes ([0,1,2,3,5,6,7] for that example).
	//
	// Every token is one of:
	//   * the literal "-1", stored as -1 (special-cased because a leading '-'
	//     would otherwise be taken for the range separator);
	//   * a single integer;
	//   * an inclusive range "from-to" (a range with from > to adds nothing).
	//
	// A token that cannot be understood (more than two '-' separated parts,
	// non-numeric text, or a number that does not fit in an int) is reported
	// with a warning and skipped, so one bad entry no longer escapes as a bare
	// std::stoi exception. A token that splits into no parts at all is skipped
	// without a warning, exactly as before.
	std::unordered_set<int> ParseNetPositionList(const std::string& str) {
	  std::unordered_set<int> net_position_list;
	  if (str.empty()) {
	    return net_position_list;
	  }

	  // std::stoi signals bad input by throwing std::invalid_argument or
	  // std::out_of_range. Convert that into a boolean so malformed numbers take
	  // the same warn-and-skip path as malformed ranges. Only the conversion is
	  // inside the try block, so unrelated failures are not swallowed.
	  const auto parse_position = [](const std::string& text, int* value) {
	    try {
	      *value = std::stoi(text);
	      return true;
	    } catch (const std::exception&) {
	      return false;
	    }
	  };

	  const auto tokens = caffe2::split(',', str);
	  for (const auto& token : tokens) {
	    if (token == "-1") {
	      net_position_list.emplace(-1);
	      continue;
	    }
	    const auto range = caffe2::split('-', token);
	    int from = 0;
	    int to = 0;
	    if (range.size() == 1 && parse_position(range[0], &from)) {
	      net_position_list.emplace(from);
	    } else if (
	        range.size() == 2 && parse_position(range[0], &from) &&
	        parse_position(range[1], &to)) {
	      for (int i = from; i <= to; ++i) {
	        net_position_list.emplace(i);
	      }
	    } else if (!range.empty()) {
	      LOG(WARNING) << "Ignoring illegal range: " << token;
	    }
	  }
	  return net_position_list;
	}

	// Turns a comma-separated list of operator types (for example "Tanh,Mul")
	// into a set holding each listed type once. An empty string yields an empty
	// set without invoking the splitter.
	std::unordered_set<std::string> ParseBlockListOps(const std::string& str) {
	  if (str.empty()) {
	    return std::unordered_set<std::string>();
	  }
	  const auto op_types = caffe2::split(',', str);
	  return std::unordered_set<std::string>(op_types.begin(), op_types.end());
	}

	// Carrying out the ONNXIFI transform
	//
	// Prepares `net` and runs OnnxifiTransformer::transform on it. In order:
	//   1. splitSparseLengthsSumSparse() is applied to the net.
	//   2. The net's external inputs are reset to `input_names` followed by
	//      `weight_names`, and its external outputs to `output_names`.
	//   3. OnnxifiTransformerOptions are filled from the arguments and the
	//      FLAGS_* values defined at the top of this file.
	//   4. The blocklist of net positions is assembled from --onnxifi_blacklist,
	//      `blocklist`, blocklisted op types and `blocklist_blobs`.
	//   5. Blocklisted Copy ops are appended for every tensor that should be
	//      observed (see the comment further down).
	//   6. The transformer runs, and afterwards the blobs named in `input_names`
	//      are removed from `ws`.
	//
	// Parameters:
	//   net                  Net to transform; modified in place.
	//   ws                   Workspace handed to the helpers and the transformer.
	//   input_names          Names registered as external inputs; also the blobs
	//                        removed from `ws` at the end.
	//   output_names         Names registered as external outputs.
	//   weight_names         Names registered as external inputs after
	//                        `input_names`; also passed to the transformer.
	//   blocklist            Net positions added to the blocklist.
	//   shape_hints_max_bs   Starting shape hints; --onnxifi_shape_hints, when
	//                        set, is parsed into a copy of this map.
	//   use_onnx             Forwarded to the options. When true, op-type based
	//                        blocklisting is skipped.
	//   max_batch_size       Forwarded to bound_shape_spec.max_batch_size.
	//   max_seq_size         Forwarded to bound_shape_spec.max_seq_size.
	//   load_model_by_blob, predictor_net_ssa_rewritten, shape_hints_per_bs
	//                        Forwarded to the options unchanged.
	//   blacklist_ops        Comma-separated op types to blocklist; falls back to
	//                        --onnxifi_blacklist_ops when not set.
	//   min_ops              Falls back to --onnxifi_min_ops when not set.
	//   blocklist_blobs      Ops whose first input is in this set are blocklisted.
	//   verify_only_single_subnet
	//                        Falls back to --verify_only_single_subnet when not
	//                        set.
	//
	// Raises (via CAFFE_THROW) when --onnxifi_input_output_observe_list names a
	// position that is negative or not smaller than the number of ops.
	void onnxifi(
	    NetDef* net,
	    Workspace* ws,
	    const std::vector<std::string>& input_names,
	    const std::vector<std::string>& output_names,
	    const std::vector<std::string>& weight_names,
	    const std::unordered_set<int>& blocklist,
	    const ShapeInfoMap& shape_hints_max_bs,
	    bool use_onnx,
	    size_t max_batch_size,
	    size_t max_seq_size,
	    bool load_model_by_blob,
	    bool predictor_net_ssa_rewritten,
	    const std::unordered_map<int, ShapeInfoMap> &shape_hints_per_bs,
	    const c10::optional<std::string> &blacklist_ops,
	    const c10::optional<size_t> &min_ops,
	    const std::unordered_set<std::string> &blocklist_blobs,
	    const c10::optional<bool> &verify_only_single_subnet) {
	  // Split SparseLengthsSumSparse so that we can lower the SparseLengthsSum part
	  splitSparseLengthsSumSparse(net, *ws);

	  // Clean up the external input/output of the net
	  net->mutable_external_input()->Clear();
	  net->mutable_external_output()->Clear();
	  for (const auto& i : input_names) {
	    net->add_external_input(i);
	  }
	  for (const auto& w : weight_names) {
	    net->add_external_input(w);
	  }
	  for (const auto& o : output_names) {
	    net->add_external_output(o);
	  }

	  // ONNXIFI transform
	  OnnxifiTransformerOptions opts;
	  opts.use_onnx = use_onnx;
	  opts.bound_shape_spec.max_batch_size = max_batch_size;
	  opts.bound_shape_spec.max_seq_size = max_seq_size;
	  opts.debug = FLAGS_onnxifi_debug_mode;
	  opts.adjust_batch = FLAGS_onnxifi_adjust_batch;
	  opts.min_ops = min_ops.value_or(FLAGS_onnxifi_min_ops);
	  opts.load_model_by_blob = load_model_by_blob;
	  opts.enforce_fp32_inputs_into_fp16 = FLAGS_enforce_fp32_inputs_into_fp16;
	  opts.merge_fp32_inputs_into_fp16 = FLAGS_merge_fp32_inputs_into_fp16;
	  opts.verify_only_single_subnet =
	      verify_only_single_subnet.value_or(FLAGS_verify_only_single_subnet);
	  opts.predictor_net_ssa_rewritten = predictor_net_ssa_rewritten;
	  opts.timeout = FLAGS_onnxifi_timeout_ms;
	  opts.shape_hints_per_bs = shape_hints_per_bs;
	  opts.use_onnxifi_batch_size = FLAGS_use_onnxifi_batch_size;

	  // Work on a copy so the caller's hints are left untouched when the flag
	  // contributes additional entries.
	  ShapeInfoMap more_shape_hints = shape_hints_max_bs;
	  if (!FLAGS_onnxifi_shape_hints.empty()) {
	    parseShapeInfoMapFromString(FLAGS_onnxifi_shape_hints, more_shape_hints);
	  }

	  // Before applying the blocklist, make sure the ops in the net all have a
	  // net_pos.
	  caffe2::BackendTransformerBase::annotateOpIndex(net);

	  // Parse the blocklist
	  auto more_blocklist = ParseNetPositionList(FLAGS_onnxifi_blacklist);
	  for (const auto& b : blocklist) {
	    more_blocklist.emplace(b);
	  }

	  // ONNX mode will change the op order so it doesn't apply here
	  if (!opts.use_onnx) {
	    auto blocklisted_ops = ParseBlockListOps(
	        blacklist_ops.value_or(FLAGS_onnxifi_blacklist_ops));
	    for (const auto& op : net->op()) {
	      if (blocklisted_ops.count(op.type())) {
	        ArgumentHelper helper(op);
	        more_blocklist.emplace(helper.GetSingleArgument(op, kNetPos, -1));
	      }
	    }
	  }
	  // exclude blocklisted blobs, which is supposed to be loaded to NVM selectively.
	  for (const auto& op : net->op()) {
	    // Ops without any input have no first input to compare against; reading
	    // input(0) on them would index past the end of the repeated field.
	    if (op.input_size() > 0 && blocklist_blobs.count(op.input(0))) {
	      ArgumentHelper helper(op);
	      more_blocklist.emplace(helper.GetSingleArgument(op, kNetPos, -1));
	    }
	  }

	  // Attach observation nodes
	  //
	  // When we want to observe intermediate tensors value out of the onnxifi op,
	  // we use the following trick:
	  //
	  // 1. for specified op, we find its input and outputs.
	  // 2. for each input and output, we create a new copy op and attach it as an
	  // input to the copy.
	  // 3. we blocklist these new copy operators from onnxification. This forces
	  // these intermediate tensors to also become outputs of the onnxifi op.
	  // 4. we put the right arguments on the copy ops so TensorObserver can print
	  // out the values.
	  auto ops_to_observe =
	      ParseNetPositionList(FLAGS_onnxifi_input_output_observe_list);
	  std::unordered_set<std::string> tensors_to_observe;
	  for (const auto& observe_pos : ops_to_observe) {
	    // The parsed list may contain negative positions (e.g. "-1"), so both
	    // ends of the valid range have to be checked before indexing the net.
	    if (observe_pos < 0 || observe_pos >= net->op().size()) {
	      CAFFE_THROW(
	          "Cannot observe operator at position ",
	          observe_pos,
	          " (out of range)");
	    }
	    const auto& op_to_observe = net->op(observe_pos);
	    tensors_to_observe.insert(
	        op_to_observe.input().begin(), op_to_observe.input().end());

	    // For Concat/Reshape with two outputs only the first output is observed.
	    if ((op_to_observe.type() == "Concat" ||
	         op_to_observe.type() == "Reshape") &&
	        op_to_observe.output().size() == 2) {
	      tensors_to_observe.insert(op_to_observe.output(0));
	    } else {
	      tensors_to_observe.insert(
	          op_to_observe.output().begin(), op_to_observe.output().end());
	    }
	  }
	  for (const auto& tensor : tensors_to_observe) {
	    OperatorDef copy_op;
	    copy_op.set_type("Copy");
	    copy_op.add_input(tensor);
	    copy_op.add_output(tensor + "_copy_output_ignore");
	    // The new op is appended at the end, so its net position is the current
	    // op count.
	    auto pos = net->op().size();
	    AddArgument(kNetPos, pos, &copy_op);
	    AddArgument("observe_input_tensors", 1, &copy_op);
	    net->add_op()->CopyFrom(copy_op);
	    more_blocklist.emplace(pos);
	  }

	  OnnxifiTransformer ts(opts);
	  ts.transform(ws, net, weight_names, more_shape_hints, more_blocklist);

	  // Cleanup the input from the workspace
	  for (const auto& i : input_names) {
	    ws->RemoveBlob(i);
	  }
	}

	} // namespace glow
	} // namespace caffe2