Back out "Revert D30599136: [Pytorch Edge][tracing-based] build tracer in OSS" (#66267)

Summary:
Previously https://github.com/pytorch/pytorch/pull/64087 broke the test `binary_macos_wheel_3_7_cpu_build`, because the wheel build is not happy with `model_tracer`. Since tracing-based selective build is still a prototype and there is no need to ship `model_tracer` via the wheel at the moment, the tracer is now built behind the `TRACING_BASED` CMake option instead. Once tracing-based selective build is mature enough, we can ship the tracer binary via the wheel.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/66267

Original commit changeset: 8ac3d75a52d0
ghstack-source-id: 140122106

Test Plan:
binary_macos_wheel_3_7_cpu_build passes

Reviewed By: dhruvbird

Differential Revision: D31478593

fbshipit-source-id: 726cab1b31c4596f6268b7824eecb20e2e59d161
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04b32ba..a0f1d0e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -427,6 +427,10 @@
     "Path to the yaml file that contains the op dependency graph for custom build.")
 set(STATIC_DISPATCH_BACKEND "" CACHE STRING
     "Name of the backend for which static dispatch code is generated, e.g.: CPU.")
+option(
+  TRACING_BASED
+  "Master flag to build Lite Interpreter with tracing build option"
+  OFF)
 
 # This is a fix for a rare build issue on Ubuntu:
 # symbol lookup error: miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol: mkl_blas_dsyrk
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 4b3330e..e37e333 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1009,6 +1009,14 @@
   add_dependencies(torch_cpu Caffe2_PROTO)
 endif()
 
+# Build model tracer for tracing-based selective build
+if(TRACING_BASED AND NOT BUILD_LITE_INTERPRETER AND NOT INTERN_BUILD_MOBILE)
+  add_subdirectory(
+    ${TORCH_ROOT}/torch/csrc/jit/mobile/model_tracer
+    ${CMAKE_BINARY_DIR}/model_tracer
+  )
+endif()
+
 # Codegen selected_mobile_ops.h for template selective build
 if(BUILD_LITE_INTERPRETER AND SELECTED_OP_LIST)
   add_custom_command(
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index e25c3a7..fe4ab30 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -444,6 +444,14 @@
 
 libtorch_core_jit_sources = sorted(jit_sources_full)
 
+torch_mobile_tracer_sources = [
+    "torch/csrc/jit/mobile/model_tracer/tracer.cpp",
+    "torch/csrc/jit/mobile/model_tracer/TensorUtils.cpp",
+    "torch/csrc/jit/mobile/model_tracer/MobileModelRunner.cpp",
+    "torch/csrc/jit/mobile/model_tracer/OperatorCallTracer.cpp",
+    "torch/csrc/jit/mobile/model_tracer/KernelDTypeTracer.cpp",
+]
+
 torch_mobile_core = [
     # backend_debug_info.cpp provides
     # __torch__.torch.classes.backend.BackendDebugInfo class
diff --git a/torch/csrc/jit/mobile/model_tracer/CMakeLists.txt b/torch/csrc/jit/mobile/model_tracer/CMakeLists.txt
new file mode 100644
index 0000000..678f3f4
--- /dev/null
+++ b/torch/csrc/jit/mobile/model_tracer/CMakeLists.txt
@@ -0,0 +1,15 @@
+set(
+  MODEL_TRACER_DIR
+  "${TORCH_ROOT}/torch/csrc/jit/mobile/model_tracer")
+
+list(APPEND MODEL_TRACER_SOURCES "")
+
+append_filelist("torch_mobile_tracer_sources" MODEL_TRACER_SOURCES)
+
+add_executable(
+  model_tracer
+  ${MODEL_TRACER_SOURCES})
+
+target_link_libraries(model_tracer PRIVATE torch)
+
+install(TARGETS model_tracer DESTINATION bin)
diff --git a/torch/csrc/jit/mobile/model_tracer/MobileModelRunner.cpp b/torch/csrc/jit/mobile/model_tracer/MobileModelRunner.cpp
new file mode 100644
index 0000000..7e2930a
--- /dev/null
+++ b/torch/csrc/jit/mobile/model_tracer/MobileModelRunner.cpp
@@ -0,0 +1,248 @@
+#include <torch/csrc/jit/mobile/model_tracer/MobileModelRunner.h>
+#include <torch/csrc/jit/mobile/model_tracer/TensorUtils.h>
+
+namespace torch {
+namespace jit {
+namespace mobile {
+
+std::vector<std::vector<at::IValue>> MobileModelRunner::
+    ivalue_to_bundled_inputs(const c10::IValue& bundled_inputs) {
+  CAFFE_ENFORCE(
+      bundled_inputs.isList(),
+      "Expected get_all_bundled_inputs to ",
+      "return a list but got a ",
+      bundled_inputs.tagKind(),
+      " instead");
+
+  c10::List<at::IValue> all_inputs = bundled_inputs.toList();
+  CAFFE_ENFORCE(
+      !all_inputs.empty(),
+      "Expected at least 1 bundled input, ",
+      "but found none. Please use ",
+      "torch.utils.bundled_inputs.augment_model_with_bundled_inputs to add.");
+
+  std::vector<std::vector<at::IValue>> ret;
+  for (at::IValue input : all_inputs) {
+    CAFFE_ENFORCE(
+        input.isTuple(),
+        "Expected list element to be a tuple ",
+        "but got a ",
+        input.tagKind(),
+        " instead");
+    ret.push_back(input.toTuple()->elements());
+  }
+
+  return ret;
+}
+
+std::unordered_map<std::string, std::string> MobileModelRunner::
+    ivalue_to_bundled_inputs_map(const c10::IValue& bundled_inputs) {
+  CAFFE_ENFORCE(
+      bundled_inputs.isGenericDict(),
+      "Expected get_bundled_inputs_functions_and_info to ",
+      "return a dict but got a ",
+      bundled_inputs.tagKind(),
+      " instead");
+
+  c10::Dict<at::IValue, at::IValue> all_inputs = bundled_inputs.toGenericDict();
+  CAFFE_ENFORCE(
+      !all_inputs.empty(),
+      "Expected at least 1 function with bundled inputs, ",
+      "but found none. Please use ",
+      "torch.utils.bundled_inputs.augment_model_with_bundled_inputs to add.");
+
+  std::unordered_map<std::string, std::string> ret;
+  for (auto& input : all_inputs) {
+    at::IValue function_name = input.key();
+    at::IValue nested_dict = input.value();
+    CAFFE_ENFORCE(
+        function_name.isString(),
+        "Expected function with inputs to be a string ",
+        "but got a ",
+        function_name.tagKind(),
+        " instead");
+    CAFFE_ENFORCE(
+        nested_dict.isGenericDict(),
+        "Expected function name to map to dictionary ",
+        "but got a ",
+        nested_dict.tagKind(),
+        " instead");
+
+    // Got the nested dict now need to convert that into std types
+    c10::Dict<at::IValue, at::IValue> function_and_info_ival_dict =
+        nested_dict.toGenericDict();
+    std::unordered_map<std::string, std::vector<std::string>>
+        function_and_info_dict;
+    for (auto& entry : function_and_info_ival_dict) {
+      at::IValue key = entry.key();
+      at::IValue value = entry.value();
+      CAFFE_ENFORCE(
+          key.isString(),
+          "Expected extra information key to be a string ",
+          "but got a ",
+          value.tagKind(),
+          " instead");
+      CAFFE_ENFORCE(
+          value.isList(),
+          "Expected extra information values to be a list ",
+          "but got a ",
+          value.tagKind(),
+          " instead");
+
+      // Got the value of the nested dict entry now need to convert it to std
+      // types
+      std::vector<std::string> data_list;
+      c10::List<at::IValue> ival_data = value.toList();
+      for (at::IValue data : ival_data) {
+        CAFFE_ENFORCE(
+            data.isString(),
+            "Expected list element of nested dict entries to be a string ",
+            "but got a ",
+            data.tagKind(),
+            " instead");
+        data_list.push_back(data.toStringRef());
+      }
+
+      // Add entry into std type mapping
+      function_and_info_dict[key.toStringRef()] = data_list;
+    }
+
+    // Could store the full mapping of std types, but the 'info' section isn't
+    // needed here.
+    std::string input_function =
+        function_and_info_dict["get_inputs_function_name"][0];
+    ret[function_name.toStringRef()] = input_function;
+  }
+
+  return ret;
+}
+
+std::vector<std::vector<at::IValue>> MobileModelRunner::
+    get_all_bundled_inputs() {
+  auto has_bundled_input = module_->find_method("get_all_bundled_inputs");
+  CAFFE_ENFORCE(
+      has_bundled_input,
+      "Model does not have bundled inputs. ",
+      "Use torch.utils.bundled_inputs.augment_model_with_bundled_inputs to add.");
+
+  c10::IValue bundled_inputs = module_->run_method("get_all_bundled_inputs");
+  return ivalue_to_bundled_inputs(bundled_inputs);
+}
+
+std::unordered_map<std::string, std::vector<std::vector<at::IValue>>>
+MobileModelRunner::get_many_functions_bundled_inputs() {
+  auto has_bundled_input =
+      module_->find_method("get_bundled_inputs_functions_and_info");
+  CAFFE_ENFORCE(
+      has_bundled_input,
+      "Model does not have bundled inputs. ",
+      "Use torch.utils.bundled_inputs.augment_many_model_functions_with_bundled_inputs to add.");
+
+  auto ival_bundled_inputs_mapping =
+      module_->run_method("get_bundled_inputs_functions_and_info");
+  auto bundled_inputs_mapping =
+      ivalue_to_bundled_inputs_map(ival_bundled_inputs_mapping);
+
+  std::unordered_map<std::string, std::vector<std::vector<at::IValue>>> ret;
+
+  for (auto& entry : bundled_inputs_mapping) {
+    std::string function_name = entry.first;
+    std::string function_to_call = entry.second;
+
+    auto has_func_to_call = module_->find_method(function_to_call);
+    CAFFE_ENFORCE(
+        has_func_to_call,
+        "Model does not have ",
+        function_to_call,
+        "Use torch.utils.bundled_inputs.augment_many_model_functions_with_bundled_inputs to add.");
+
+    c10::IValue bundled_inputs = module_->run_method(function_to_call);
+    ret[function_name] = ivalue_to_bundled_inputs(bundled_inputs);
+  }
+  return ret;
+}
+
+std::vector<at::IValue> MobileModelRunner::run_with_inputs(
+    std::vector<std::vector<at::IValue>> const& bundled_inputs) {
+  std::vector<at::IValue> ret;
+  ret.reserve(bundled_inputs.size());
+  for (std::vector<at::IValue> const& input : bundled_inputs) {
+    ret.emplace_back(module_->forward(input));
+  }
+  return ret;
+}
+
+std::vector<at::IValue> MobileModelRunner::run_with_inputs(
+    const std::string& function_name,
+    std::vector<std::vector<at::IValue>> const& bundled_inputs) const {
+  std::vector<at::IValue> ret;
+  ret.reserve(bundled_inputs.size());
+  auto has_bundled_input = module_->find_method(function_name);
+  CAFFE_ENFORCE(
+      has_bundled_input,
+      "Model does not have the method named ",
+      function_name,
+      "Please ensure that it was exported correctly");
+  for (std::vector<at::IValue> const& input : bundled_inputs) {
+    auto func = module_->get_method(function_name);
+    ret.emplace_back(func(input));
+  }
+  return ret;
+}
+
+void MobileModelRunner::run_argless_functions(
+    const std::vector<std::string>& functions) {
+  for (auto& function_name : functions) {
+    if (module_->find_method(function_name)) {
+      module_->run_method(function_name);
+    }
+  }
+}
+
+std::string MobileModelRunner::get_extra_file_contents(
+    std::string const& file_path,
+    std::string const& extra_file_name) {
+  std::unordered_map<std::string, std::string> extra;
+  extra[extra_file_name] = "";
+  torch::jit::_load_extra_only_for_mobile(file_path, c10::nullopt, extra);
+  return extra[extra_file_name];
+}
+
+bool MobileModelRunner::set_has_metal_gpu_operators(
+    std::set<std::string> const& op_list) {
+  for (std::string const& op : op_list) {
+    if (op.find("metal::") == 0) {
+      return true;
+    }
+    if (op.find("metal_prepack_unet::") == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void MobileModelRunner::for_each_tensor_in_bundled_inputs(
+    std::function<void(const ::at::Tensor&)> const& func) {
+  if (has_new_style_bundled_inputs()) {
+    // Get the bundled inputs and access the arg level ivalues stored within
+    auto bundled_inputs_mapping = this->get_many_functions_bundled_inputs();
+
+    // Loop over functions
+    for (auto& entry : bundled_inputs_mapping) {
+      std::vector<std::vector<at::IValue>> bundled_inputs = entry.second;
+      // Loop through inputs
+      for (const std::vector<at::IValue>& input : bundled_inputs) {
+        // Loop through values in an input
+        for (const at::IValue& iv : input) {
+          for_each_tensor_in_ivalue(iv, func);
+        }
+      }
+    }
+  } else {
+    c10::IValue iv = module_->run_method("get_all_bundled_inputs");
+    for_each_tensor_in_ivalue(iv, func);
+  }
+}
+} // namespace mobile
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/mobile/model_tracer/MobileModelRunner.h b/torch/csrc/jit/mobile/model_tracer/MobileModelRunner.h
new file mode 100644
index 0000000..5f394be
--- /dev/null
+++ b/torch/csrc/jit/mobile/model_tracer/MobileModelRunner.h
@@ -0,0 +1,162 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include <torch/csrc/autograd/grad_mode.h>
+#include <torch/csrc/jit/mobile/import.h>
+#include <torch/csrc/jit/mobile/module.h>
+#include <torch/csrc/jit/serialization/export.h>
+#include <torch/script.h>
+
+namespace torch {
+namespace jit {
+namespace mobile {
+
+class MobileModelRunner {
+  std::shared_ptr<torch::jit::mobile::Module> module_;
+
+ public:
+  explicit MobileModelRunner(std::string const& file_path) {
+    module_ = std::make_shared<torch::jit::mobile::Module>(
+        torch::jit::_load_for_mobile(file_path));
+  }
+
+  MobileModelRunner(
+      std::string const& file_path,
+      uint64_t module_load_options) {
+    std::unordered_map<std::string, std::string> extra_files;
+    module_ = std::make_shared<torch::jit::mobile::Module>(
+        torch::jit::_load_for_mobile(
+            file_path,
+            at::Device(at::DeviceType::CPU, 0),
+            extra_files,
+            module_load_options));
+  }
+
+  /**
+   * Fetches the contents of the file named "extra/" + extra_file_name from the
+   * .ptl archive at location file_path. The contents are returned as an
+   * std::string.
+   *
+   * An empty string is returned if the file at the location "extra/" +
+   * extra_file_name does not exist or is an empty file (within the .ptl
+   * archive).
+   *
+   * An exception is thrown if the .ptl file at location file_path does not
+   * exist.
+   *
+   */
+  static std::string get_extra_file_contents(
+      std::string const& file_path,
+      std::string const& extra_file_name);
+
+  /**
+   * Returns true if the list of operators passed in has a Metal GPU operator,
+   * and false otherwise.
+   *
+   */
+  static bool set_has_metal_gpu_operators(std::set<std::string> const& op_list);
+
+  /**
+   * Fetches the set of root operators in the file "extra/mobile_info.json"
+   * within the .ptl archive at location file_path.
+   *
+   * An exception is thrown if:
+   *
+   * 1. The file at file_path does not exist, or
+   * 2. The contents of extra/mobile_info.json are not valid JSON, or
+   * 3. The file extra/mobile_info.json does not exist, or
+   * 4. The JSON is malformed in some way and the operator list cannot be
+   * extracted correctly.
+   *
+   */
+  static std::set<std::string> get_operators_from_mobile_info_json(
+      std::string const& file_path);
+
+  static std::vector<std::vector<at::IValue>> ivalue_to_bundled_inputs(
+      const c10::IValue& bundled_inputs);
+
+  static std::unordered_map<std::string, std::string>
+  ivalue_to_bundled_inputs_map(const c10::IValue& bundled_inputs);
+
+  /**
+   * Fetches all the bundled inputs of the loaded mobile model.
+   *
+   * A bundled input itself is of type std::vector<at::IValue> and the
+   * elements of this vector<> are the arguments that the "forward"
+   * method of the model accepts, i.e. each at::IValue is a
+   * single argument to the model's "forward" method.
+   *
+   * The outer vector holds a bundled input. For models with bundled
+   * inputs, the outermost vector will have size > 0.
+   */
+  std::vector<std::vector<at::IValue>> get_all_bundled_inputs();
+
+  /**
+   * Fetches all the bundled inputs for all functions of the loaded mobile
+   * model.
+   *
+   * The mapping is from function names, e.g. 'forward', to the bundled inputs
+   * for that function.
+   *
+   * A bundled input itself is of type std::vector<at::IValue> and the
+   * elements of this vector<> are the arguments that the corresponding
+   * method of the model accepts, i.e. each at::IValue in the entry
+   * for forward is a single argument to the model's "forward" method.
+   *
+   * The outer vector of each value holds a bundled input. For models with
+   * bundled inputs, the outermost vector will have size > 0.
+   */
+  std::unordered_map<std::string, std::vector<std::vector<at::IValue>>>
+  get_many_functions_bundled_inputs();
+
+  /**
+   * Returns true if a model possesses get_bundled_inputs_functions_and_info()
+   */
+  bool has_new_style_bundled_inputs() const {
+    return module_->find_method("get_bundled_inputs_functions_and_info") !=
+        c10::nullopt;
+  }
+
+  /**
+   * For each tensor in bundled inputs, call the user-provided function 'func'.
+   */
+  void for_each_tensor_in_bundled_inputs(
+      std::function<void(const ::at::Tensor&)> const& func);
+
+  /**
+   * Get the root operators directly called by this model's Bytecode.
+   */
+  std::set<std::string> get_root_operators() {
+    return torch::jit::mobile::_export_operator_list(*module_);
+  }
+
+  /**
+   * Runs the model against all of the provided inputs using the model's
+   * "forward" method. Returns an std::vector<at::IValue>, where each element
+   * of the returned vector is one of the return values from calling forward().
+   */
+  std::vector<at::IValue> run_with_inputs(
+      std::vector<std::vector<at::IValue>> const& bundled_inputs);
+
+  /**
+   * Runs the model against all of the provided inputs for the specified
+   * function. Returns an std::vector<at::IValue>, where each element
+   * of the returned vector is one of the return values from calling the
+   * method named "function_name" on this model.
+   */
+  std::vector<at::IValue> run_with_inputs(
+      const std::string& function_name,
+      std::vector<std::vector<at::IValue>> const& bundled_inputs) const;
+
+  /**
+   * Attempts to run all functions in the passed-in list if they exist. All
+   * functions should take no arguments.
+   */
+  void run_argless_functions(const std::vector<std::string>& functions);
+};
+
+} // namespace mobile
+} // namespace jit
+} // namespace torch
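
For illustration only (not part of this change): a minimal, hedged usage sketch of the MobileModelRunner API above. The model path is a placeholder, and the sketch assumes a lite-interpreter model that already has bundled inputs attached via torch.utils.bundled_inputs.

```cpp
// Hedged usage sketch of MobileModelRunner; "/tmp/model.ptl" is a placeholder
// for a lite-interpreter model that has bundled inputs attached.
#include <torch/csrc/jit/mobile/model_tracer/MobileModelRunner.h>

#include <iostream>

int main() {
  torch::jit::mobile::MobileModelRunner runner("/tmp/model.ptl");

  // Root operators referenced directly by the model's bytecode.
  for (const std::string& op : runner.get_root_operators()) {
    std::cout << "root op: " << op << "\n";
  }

  // Feed every bundled input through forward(); throws if none are bundled.
  auto results = runner.run_with_inputs(runner.get_all_bundled_inputs());
  std::cout << "ran " << results.size() << " bundled input(s)\n";

  // Visit every tensor stored inside the bundled inputs.
  runner.for_each_tensor_in_bundled_inputs([](const at::Tensor& t) {
    std::cout << "bundled tensor sizes: " << t.sizes() << "\n";
  });
  return 0;
}
```
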
diff --git a/torch/csrc/jit/mobile/model_tracer/TensorUtils.cpp b/torch/csrc/jit/mobile/model_tracer/TensorUtils.cpp
new file mode 100644
index 0000000..9a23814
--- /dev/null
+++ b/torch/csrc/jit/mobile/model_tracer/TensorUtils.cpp
@@ -0,0 +1,42 @@
+#include <c10/util/Exception.h>
+#include <torch/csrc/jit/mobile/model_tracer/TensorUtils.h>
+
+namespace torch {
+namespace jit {
+namespace mobile {
+void for_each_tensor_in_ivalue(
+    const c10::IValue& iv,
+    std::function<void(const ::at::Tensor&)> const& func) {
+  const bool is_leaf_type = iv.isString() || iv.isNone() || iv.isScalar() ||
+      iv.isDouble() || iv.isInt() || iv.isBool() || iv.isDevice() ||
+      iv.isIntList() || iv.isDoubleList() || iv.isBoolList();
+  if (is_leaf_type) {
+    // Do Nothing.
+    return;
+  }
+
+  if (iv.isTensor()) {
+    func(iv.toTensor());
+  } else if (iv.isTuple()) {
+    c10::intrusive_ptr<at::ivalue::Tuple> tup_ptr = iv.toTuple();
+    for (const auto& e : tup_ptr->elements()) {
+      for_each_tensor_in_ivalue(e, func);
+    }
+  } else if (iv.isList()) {
+    c10::List<c10::IValue> l = iv.toList();
+    for (auto&& i : l) {
+      c10::IValue item = i;
+      for_each_tensor_in_ivalue(item, func);
+    }
+  } else if (iv.isGenericDict()) {
+    c10::Dict<c10::IValue, c10::IValue> dict = iv.toGenericDict();
+    for (auto& it : dict) {
+      for_each_tensor_in_ivalue(it.value(), func);
+    }
+  } else {
+    AT_ERROR("Unhandled type of IValue. Got ", iv.tagKind());
+  }
+}
+} // namespace mobile
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/mobile/model_tracer/TensorUtils.h b/torch/csrc/jit/mobile/model_tracer/TensorUtils.h
new file mode 100644
index 0000000..6837a7b
--- /dev/null
+++ b/torch/csrc/jit/mobile/model_tracer/TensorUtils.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <ATen/core/ivalue.h>
+
+namespace torch {
+namespace jit {
+namespace mobile {
+/**
+ * Recursively scans the IValue object, traversing lists, tuples, and dicts,
+ * and calls the user-provided callback function 'func' on every Tensor found.
+ */
+void for_each_tensor_in_ivalue(
+    const ::c10::IValue& iv,
+    std::function<void(const ::at::Tensor&)> const& func);
+} // namespace mobile
+} // namespace jit
+} // namespace torch
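
For illustration only (not part of this change): a small, hedged sketch of how for_each_tensor_in_ivalue traverses a hand-built nested IValue; the values below are arbitrary placeholders.

```cpp
// Hedged sketch exercising for_each_tensor_in_ivalue on a hand-built value:
// a tuple holding a tensor, an int, and a list of tensors.
#include <torch/csrc/jit/mobile/model_tracer/TensorUtils.h>

#include <ATen/ATen.h>
#include <ATen/core/List.h>
#include <iostream>
#include <vector>

int main() {
  c10::List<at::Tensor> tensor_list({at::zeros({3}), at::ones({4, 4})});
  c10::IValue nested = c10::ivalue::Tuple::create(
      std::vector<c10::IValue>{at::ones({2, 2}), 42, tensor_list});

  // The callback fires once per tensor found anywhere in the structure
  // (three times for the value built above).
  torch::jit::mobile::for_each_tensor_in_ivalue(nested, [](const at::Tensor& t) {
    std::cout << "found tensor with sizes " << t.sizes() << "\n";
  });
  return 0;
}
```
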
diff --git a/torch/csrc/jit/mobile/model_tracer/tracer.cpp b/torch/csrc/jit/mobile/model_tracer/tracer.cpp
new file mode 100644
index 0000000..72fa6cd
--- /dev/null
+++ b/torch/csrc/jit/mobile/model_tracer/tracer.cpp
@@ -0,0 +1,327 @@
+#include <iostream>
+#include <string>
+
+/**
+ * tracer.cpp builds into a binary that accepts a TorchScript model or a
+ * Torch Mobile Model (with bytecode.pkl) that has at least 1 bundled
+ * input. The binary feeds the bundled input(s) into the model and
+ * executes it using the lite interpreter.
+ *
+ * Both root operators and called operators are recorded and saved
+ * into a YAML file (whose path is provided on the command line).
+ *
+ * Note: Root operators may include primary and other operators that
+ * are not invoked using the dispatcher, and hence they may not show
+ * up in the Traced Operator list.
+ *
+ */
+
+#include <ATen/core/dispatch/ObservedOperators.h>
+#include <torch/csrc/autograd/grad_mode.h>
+#include <torch/csrc/jit/mobile/import.h>
+#include <torch/csrc/jit/mobile/model_tracer/KernelDTypeTracer.h>
+#include <torch/csrc/jit/mobile/model_tracer/MobileModelRunner.h>
+#include <torch/csrc/jit/mobile/model_tracer/OperatorCallTracer.h>
+#include <torch/csrc/jit/mobile/model_tracer/TensorUtils.h>
+#include <torch/csrc/jit/mobile/module.h>
+#include <torch/csrc/jit/mobile/parse_operators.h>
+#include <torch/script.h>
+
+typedef std::map<std::string, std::set<std::string>> kt_type;
+
+C10_DEFINE_string(
+    model_input_path,
+    "",
+    "The path of the input model file (.ptl).");
+
+C10_DEFINE_string(
+    build_yaml_path,
+    "",
+    "The path of the output YAML file containing traced operator information.");
+
+#define REQUIRE_STRING_ARG(name)                            \
+  if (FLAGS_##name.empty()) {                               \
+    std::cerr << "You must specify the flag --" #name "\n"; \
+    return 1;                                               \
+  }
+
+#define REQUIRE_INT_ARG(name)                               \
+  if (FLAGS_##name == -1) {                                 \
+    std::cerr << "You must specify the flag --" #name "\n"; \
+    return 1;                                               \
+  }
+
+const std::vector<std::string> always_included_traced_ops = {
+    // The following are called from setup sections.
+    "aten::resize_",
+    "aten::slice.Tensor",
+};
+
+// Fetched from caffe2/aten/src/ATen/native/metal/MetalAten.mm
+// Diffusion Link: https://fburl.com/diffusion/atwwmax2
+const std::vector<std::string> gpu_metal_operators = {
+    "aten::conv2d",
+    "aten::add.Tensor",
+    "aten::add_.Tensor",
+    "aten::addmm",
+    "aten::empty.memory_format",
+    "aten::empty_strided",
+    "aten::log_softmax.int",
+    "aten::max_pool2d",
+    "aten::mul.Tensor",
+    "aten::relu",
+    "aten::relu_",
+    "aten::sigmoid",
+    "aten::sub.Tensor",
+    "aten::upsample_nearest2d.vec",
+    "aten::view",
+    "aten::adaptive_avg_pool2d",
+    "aten::hardtanh_",
+    "aten::reshape",
+    "aten::flatten.using_ints",
+};
+
+void printOpYAML(
+    std::ostream& out,
+    int indent,
+    const std::string& op_name,
+    bool is_used_for_training,
+    bool is_root_operator,
+    bool include_all_overloads) {
+  out << std::string(indent, ' ') << op_name << ":" << std::endl;
+  out << std::string(indent + 2, ' ')
+      << "is_used_for_training: " << (is_used_for_training ? "true" : "false")
+      << std::endl;
+  out << std::string(indent + 2, ' ')
+      << "is_root_operator: " << (is_root_operator ? "true" : "false")
+      << std::endl;
+  out << std::string(indent + 2, ' ')
+      << "include_all_overloads: " << (include_all_overloads ? "true" : "false")
+      << std::endl;
+}
+
+void printOpsYAML(
+    std::ostream& out,
+    const std::set<std::string>& operator_list,
+    bool is_used_for_training,
+    bool is_root_operator,
+    bool include_all_overloads) {
+  for (auto& it : operator_list) {
+    printOpYAML(out, 2, it, is_used_for_training, is_root_operator, include_all_overloads);
+  }
+}
+
+/**
+ * These are a collection of some common ATen methods that are usually
+ * called outside of the Model's forward() run, and they need to be
+ * traced to ensure that the used operators are included in the build.
+ * If/When this list becomes too long, we can consider making it a
+ * per-model list.
+ */
+void call_setup_methods() {
+  at::zeros({2, 2});
+  at::ones({2, 2});
+  at::Tensor t1 = at::empty({7, 7});
+  at::Tensor t2 = t1.fill_(3);
+  at::narrow(t2, 1, 0, 1);
+  at::eq(t1, t2);
+  const volatile bool nz = at::zeros({1}).is_nonzero();
+  (void)nz;
+
+  // Create a byte tensor and copy it
+  auto zb = at::zeros({10}, at::kByte);
+  auto zf = at::zeros({10}, at::kFloat);
+  zb.copy_(zf);
+  t2.div(1);
+
+  // Typically, failures show up in CopyKernel.cpp, so enumerating
+  // common dtypes that may show up.
+  const auto all_dtypes_for_copy = {
+      at::kByte,
+      at::kFloat,
+      at::kInt,
+      at::kChar,
+      at::kDouble,
+      at::kShort,
+      at::kLong};
+  for (const auto dtype : all_dtypes_for_copy) {
+    auto tensor1 = at::empty({10}, dtype);
+    tensor1.copy_(at::zeros({10}, at::kFloat));
+  }
+
+  torch::zeros({0, 0}, torch::ScalarType::Float);
+  std::vector<float> storage(20, 1.0);
+  std::vector<int64_t> sizes({2, 10});
+  torch::from_blob(storage.data(), at::IntArrayRef(sizes), at::kFloat);
+}
+
+/**
+ * Call methods on the Tensor object that we expect to be called
+ * in production on this Tensor.
+ */
+void consume_tensor(const at::Tensor& t) {
+  const at::Tensor& c = t;
+  c.copy_(t.cpu());
+}
+
+void run_model(
+    const std::string& input_module_path,
+    std::set<std::string>& root_ops,
+    std::set<std::string>& enabled_backends,
+    torch::jit::mobile::KernelDTypeTracer::kernel_tags_type&
+        called_kernel_tags) {
+  // Load the module on CPU with the flag to skip the operator exists check.
+  // This is needed so that we can load any TorchBind objects (custom classes)
+  // that this model refers to so that any operators being called from those
+  // TorchBind objects can be traced by the model tracer.
+  //
+  torch::jit::mobile::MobileModelRunner module_runner(input_module_path, 0);
+  root_ops = module_runner.get_root_operators();
+  std::cout << "Got " << root_ops.size() << " Root Operators." << std::endl;
+
+  if (torch::jit::mobile::MobileModelRunner::set_has_metal_gpu_operators(
+          root_ops)) {
+    std::cout << "Inferred Metal GPU Model." << std::endl;
+    root_ops.insert(gpu_metal_operators.begin(), gpu_metal_operators.end());
+    called_kernel_tags["__unused__"] = {"Float"};
+    enabled_backends.insert("Metal GPU");
+
+    // When we encounter a GPU model, we should call .cpu().copy_() on the
+    // tensors in the bundled inputs, since this is what will happen when
+    // such a model is executed on an iOS device (to copy the Tensor to Metal
+    // memory via a call to .metal()).
+    module_runner.for_each_tensor_in_bundled_inputs(consume_tensor);
+  } else {
+    std::cout << "Inferred CPU Model." << std::endl;
+    enabled_backends.insert("CPU");
+    torch::jit::mobile::MobileModelRunner mobile_module_runner(
+        input_module_path);
+
+    // When we encounter a CPU model, we should call .cpu().copy_() on the
+    // tensors in the bundled inputs, since this is what will happen when
+    // such a model is executed on an Android device, where the PyTorch JNI
+    // bindings call .cpu() in JIValue::newJIValueFromAtIValue().
+    module_runner.for_each_tensor_in_bundled_inputs(consume_tensor);
+
+    // If a user bundled their inputs after the API was updated to accept
+    // bundled inputs for multiple methods, they should go down this route.
+    // Even if they only bundle inputs for forward, they will have the
+    // new-style bundled inputs. Since at this point tracer.cpp does not know
+    // which functions have bundled inputs, we must call
+    // get_bundled_inputs_functions_and_info, if it exists, to get the set.
+    if (mobile_module_runner.has_new_style_bundled_inputs()) {
+      auto bundled_inputs_mapping =
+          mobile_module_runner.get_many_functions_bundled_inputs();
+      for (auto& entry : bundled_inputs_mapping) {
+        std::string function_name = entry.first;
+        std::vector<std::vector<at::IValue>> bundled_inputs = entry.second;
+        std::cout << "Got " << bundled_inputs.size() << " bundled input(s) for "
+                  << function_name << "\n\n";
+        std::vector<at::IValue> results =
+            mobile_module_runner.run_with_inputs(function_name, bundled_inputs);
+
+        for (auto& result : results) {
+          // Consume the result Tensor(s) when tracing on CPU since the
+          // Android/Java JNI bindings will do the same.
+          torch::jit::mobile::for_each_tensor_in_ivalue(result, consume_tensor);
+        }
+      }
+      // If get_bundled_inputs_functions_and_info does not exist, we assume the
+      // inputs were bundled before that change was made. If no bundled inputs
+      // are found here either, an error will be thrown.
+    } else {
+      std::vector<std::vector<at::IValue>> bundled_inputs =
+          mobile_module_runner.get_all_bundled_inputs();
+      std::cout << "Got " << bundled_inputs.size() << " bundled input(s)\n\n";
+      std::vector<at::IValue> results =
+          mobile_module_runner.run_with_inputs(bundled_inputs);
+
+      for (auto& result : results) {
+        // Consume the result Tensor(s) when tracing on CPU since the
+        // Android/Java JNI bindings will do the same.
+        torch::jit::mobile::for_each_tensor_in_ivalue(result, consume_tensor);
+      }
+    }
+  }
+}
+
+/**
+ * Runs a PyTorch model (full/lite) against its bundled inputs under the
+ * lite interpreter, and writes out a YAML file listing the root and
+ * traced (called) operators.
+ */
+int main(int argc, char* argv[]) {
+  if (!c10::ParseCommandLineFlags(&argc, &argv)) {
+    std::cerr << "Failed to parse command line flags!" << std::endl;
+    return 1;
+  }
+
+  REQUIRE_STRING_ARG(model_input_path);
+  REQUIRE_STRING_ARG(build_yaml_path);
+
+  const std::string input_module_path = FLAGS_model_input_path;
+
+  std::ofstream yaml_out(FLAGS_build_yaml_path);
+
+  std::cout << "Processing: " << input_module_path << std::endl;
+  std::cout << "Output: " << FLAGS_build_yaml_path << std::endl;
+
+  at::globalContext().setQEngine(at::QEngine::QNNPACK);
+  c10::ObservedOperators::getUnobservedOperatorList().clear();
+
+  torch::jit::mobile::OperatorCallTracer op_tracer;
+  torch::jit::mobile::KernelDTypeTracer kdtype_tracer;
+
+  call_setup_methods();
+
+  std::set<std::string> root_ops, traced_operators, enabled_backends;
+  torch::jit::mobile::KernelDTypeTracer::kernel_tags_type called_kernel_tags;
+
+  using torch::jit::MobileModuleLoadOptions;
+
+  // Run the model with the QNNPACK QEngine first, then again with FBGEMM.
+  run_model(input_module_path, root_ops, enabled_backends, called_kernel_tags);
+  at::globalContext().setQEngine(at::QEngine::FBGEMM);
+  run_model(input_module_path, root_ops, enabled_backends, called_kernel_tags);
+
+  traced_operators = op_tracer.getCalledOperators();
+  called_kernel_tags.insert(
+      kdtype_tracer.getCalledKernelTags().begin(),
+      kdtype_tracer.getCalledKernelTags().end());
+  traced_operators.insert(
+      always_included_traced_ops.begin(), always_included_traced_ops.end());
+
+  if (traced_operators.size() <= always_included_traced_ops.size()) {
+    std::cerr
+        << c10::str(
+               "Error traced_operators size: ",
+               traced_operators.size(),
+               ". Expected the traced operator list to be bigger then the default size ",
+               always_included_traced_ops.size(),
+               ". Please report a bug in PyTorch.")
+        << std::endl;
+  }
+
+  // If an op exists in both traced_operators and root_ops, keep it in root_ops only.
+  for (const auto& root_op : root_ops) {
+    if (traced_operators.find(root_op) != traced_operators.end()) {
+      traced_operators.erase(root_op);
+    }
+  }
+
+  yaml_out << "include_all_kernel_dtypes: true" << std::endl;
+  yaml_out << "operators:" << std::endl;
+  printOpsYAML(
+      yaml_out,
+      root_ops,
+      false /* is_used_for_training */,
+      true /* is_root_operator */,
+      false /* include_all_overloads */);
+  printOpsYAML(
+      yaml_out,
+      traced_operators,
+      false /* is_used_for_training */,
+      false /* is_root_operator */,
+      false /* include_all_overloads */);
+  return 0;
+}
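
For illustration only (not part of this change): a hedged, self-contained sketch of the YAML shape the tracer emits. The operator names are placeholders, and print_op_yaml simply mirrors printOpYAML above with the hard-coded training/overload flags used in this file.

```cpp
// Hedged, self-contained sketch of the YAML emitted by the tracer above.
// Operator names are placeholders; print_op_yaml mirrors printOpYAML with the
// `false` values for is_used_for_training/include_all_overloads used above.
#include <iostream>
#include <set>
#include <string>

namespace {
void print_op_yaml(std::ostream& out, const std::string& op, bool is_root) {
  out << "  " << op << ":\n";
  out << "    is_used_for_training: false\n";
  out << "    is_root_operator: " << (is_root ? "true" : "false") << "\n";
  out << "    include_all_overloads: false\n";
}
} // namespace

int main() {
  const std::set<std::string> root_ops = {"aten::conv2d"};
  const std::set<std::string> traced_ops = {"aten::copy_"};

  std::cout << "include_all_kernel_dtypes: true\n";
  std::cout << "operators:\n";
  for (const auto& op : root_ops) {
    print_op_yaml(std::cout, op, /*is_root=*/true);
  }
  for (const auto& op : traced_ops) {
    print_op_yaml(std::cout, op, /*is_root=*/false);
  }
  // Expected output:
  //
  // include_all_kernel_dtypes: true
  // operators:
  //   aten::conv2d:
  //     is_used_for_training: false
  //     is_root_operator: true
  //     include_all_overloads: false
  //   aten::copy_:
  //     is_used_for_training: false
  //     is_root_operator: false
  //     include_all_overloads: false
  return 0;
}
```
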