Metal GPU throughput benchmarking app.

PiperOrigin-RevId: 409903272
Change-Id: I7ae7cfd936915a93f836563581b7439b1b197af2
diff --git a/tensorflow/lite/delegates/gpu/metal/benchmarking/BUILD b/tensorflow/lite/delegates/gpu/metal/benchmarking/BUILD
new file mode 100644
index 0000000..708fe0e
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/metal/benchmarking/BUILD
@@ -0,0 +1,67 @@
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_application")
+load("@build_bazel_rules_apple//apple:macos.bzl", "macos_application")
+
+package(
+    default_visibility = ["//visibility:public"],
+    licenses = ["notice"],
+)
+
+objc_library(
+    name = "benchmark_lib",
+    srcs = ["main.mm"],
+    data = glob([
+        "models/*.tflite",
+    ]),
+    sdk_frameworks = [
+        "Metal",
+        "QuartzCore",
+    ],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:kernel_api",
+        "//tensorflow/lite/c:common",
+        "//tensorflow/lite/delegates/gpu/common:gpu_info",
+        "//tensorflow/lite/delegates/gpu/common:model",
+        "//tensorflow/lite/delegates/gpu/common:model_builder",
+        "//tensorflow/lite/delegates/gpu/common:model_transformer",
+        "//tensorflow/lite/delegates/gpu/common:precision",
+        "//tensorflow/lite/delegates/gpu/common:shape",
+        "//tensorflow/lite/delegates/gpu/common:util",
+        "//tensorflow/lite/delegates/gpu/common/transformations:model_transformations",
+        "//tensorflow/lite/delegates/gpu/metal:inference_context",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:kernel_util",
+    ],
+)
+
+# Applications for local benchmarking in Xcode
+ios_application(
+    name = "iOSBenchmark",
+    testonly = 1,
+    bundle_id = "com.tensorflow.lite.delegates.gpu.metal.benchmarking",
+    families = [
+        "iphone",
+        "ipad",
+    ],
+    infoplists = ["Info.plist"],
+    minimum_os_version = "12.0",
+    provisioning_profile = "//tensorflow/lite/delegates/gpu/metal/benchmarking:provisioning_profile.mobileprovision",
+    tags = [
+        "local",
+        "notap",
+    ],
+    deps = [":benchmark_lib"],
+)
+
+macos_application(
+    name = "MacOSBenchmark",
+    testonly = 1,
+    bundle_id = "com.tensorflow.lite.delegates.gpu.metal.benchmarking",
+    infoplists = ["Info.plist"],
+    minimum_os_version = "10.13",
+    tags = [
+        "local",
+        "notap",
+    ],
+    deps = [":benchmark_lib"],
+)
diff --git a/tensorflow/lite/delegates/gpu/metal/benchmarking/Benchmark.tulsiproj/Configs/Benchmark.tulsigen b/tensorflow/lite/delegates/gpu/metal/benchmarking/Benchmark.tulsiproj/Configs/Benchmark.tulsigen
new file mode 100644
index 0000000..3fe6142
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/metal/benchmarking/Benchmark.tulsiproj/Configs/Benchmark.tulsigen
@@ -0,0 +1,17 @@
+{
+  "sourceFilters" : [
+    "tensorflow/lite/delegates/gpu/...",
+    "tensorflow/lite/delegates/gpu/metal/benchmarking/models",
+  ],
+  "buildTargets" : [
+    "//tensorflow/lite/delegates/gpu/metal/benchmarking:MacOSBenchmark",
+    "//tensorflow/lite/delegates/gpu/metal/benchmarking:iOSBenchmark",
+  ],
+  "projectName" : "Benchmark",
+  "additionalFilePaths" : [
+    "tensorflow/lite/delegates/gpu/BUILD",
+    "tensorflow/lite/delegates/gpu/metal/BUILD",
+    "tensorflow/lite/delegates/gpu/metal/kernels/BUILD",
+    "tensorflow/lite/delegates/gpu/metal/benchmarking/BUILD",
+  ]
+}
diff --git a/tensorflow/lite/delegates/gpu/metal/benchmarking/Benchmark.tulsiproj/project.tulsiconf b/tensorflow/lite/delegates/gpu/metal/benchmarking/Benchmark.tulsiproj/project.tulsiconf
new file mode 100644
index 0000000..ed7355f
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/metal/benchmarking/Benchmark.tulsiproj/project.tulsiconf
@@ -0,0 +1,7 @@
+{
+  "packages" : [
+    "tensorflow/lite/delegates/gpu/metal"
+  ],
+  "projectName" : "Benchmark",
+  "workspaceRoot" : "../../../../../../.."
+}
diff --git a/tensorflow/lite/delegates/gpu/metal/benchmarking/Info.plist b/tensorflow/lite/delegates/gpu/metal/benchmarking/Info.plist
new file mode 100644
index 0000000..e20e81a
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/metal/benchmarking/Info.plist
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleExecutable</key>
+	<string>$(EXECUTABLE_NAME)</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>$(PRODUCT_NAME)</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleVersion</key>
+	<string>1</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+	<key>UIStatusBarStyle</key>
+	<string>UIStatusBarStyleLightContent</string>
+	<key>UIViewControllerBasedStatusBarAppearance</key>
+	<false/>
+</dict>
+</plist>
\ No newline at end of file
diff --git a/tensorflow/lite/delegates/gpu/metal/benchmarking/main.mm b/tensorflow/lite/delegates/gpu/metal/benchmarking/main.mm
new file mode 100644
index 0000000..02be26b
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/metal/benchmarking/main.mm
@@ -0,0 +1,207 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#import <Metal/Metal.h>
+
+#include <iostream>
+#include <string>
+
+#include "tensorflow/lite/builtin_ops.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
+#include "tensorflow/lite/delegates/gpu/common/model.h"
+#include "tensorflow/lite/delegates/gpu/common/model_builder.h"
+#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
+#include "tensorflow/lite/delegates/gpu/common/precision.h"
+#include "tensorflow/lite/delegates/gpu/common/shape.h"
+#include "tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h"
+#include "tensorflow/lite/delegates/gpu/common/util.h"
+#include "tensorflow/lite/delegates/gpu/metal/inference_context.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/interpreter_builder.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model_builder.h"
+
+namespace tflite {
+namespace gpu {
+namespace metal {
+namespace {
+
+absl::Status GPUBenchmark(GraphFloat32* graph, int num_tests, int iterations,
+                          bool use_fp16 = true) {
+  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+  std::string device_name = std::string([[device name] UTF8String]);
+  GpuInfo gpu_info;
+  GetGpuInfoFromDeviceDescription(device_name, GpuApi::kMetal, &gpu_info);
+  CalculationsPrecision precision;
+  if (use_fp16) {
+    if (gpu_info.IsRoundToNearestSupported()) {
+      precision = CalculationsPrecision::F16;
+    } else {
+      precision = CalculationsPrecision::F32_F16;
+    }
+  } else {
+    precision = CalculationsPrecision::F32;
+  }
+
+  InferenceContext::CreateInferenceInfo create_info;
+  create_info.precision = precision;
+  create_info.storage_type = TensorStorageType::BUFFER;
+  create_info.hints.Add(ModelHints::kAllowSpecialKernels);
+  InferenceContext inference_context;
+  RETURN_IF_ERROR(inference_context.InitFromGraphWithTransforms(create_info, graph, device));
+
+  id<MTLCommandQueue> command_queue = [device newCommandQueue];
+  bool kPerOpProfiling = false;
+  if (kPerOpProfiling) {
+    ProfilingInfo profiling_info;
+    inference_context.Profile(device, &profiling_info);
+    std::cout << profiling_info.GetDetailedReport() << std::endl;
+  }
+  const std::string precision_str = use_fp16 ? "FP16" : "FP32";
+  std::cout << "Measuring started: (" << num_tests << " tests, " << iterations
+      << " iterations every test, " << precision_str << " precision)" << std::endl;
+  for (int j = 0; j < num_tests; ++j) {
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < iterations; ++i) {
+      @autoreleasepool {
+        id<MTLCommandBuffer> command_buffer = [command_queue commandBuffer];
+        id<MTLComputeCommandEncoder> encoder =
+            [command_buffer computeCommandEncoder];
+        inference_context.EncodeWithEncoder(encoder);
+        [encoder endEncoding];
+        [command_buffer commit];
+        if (i == iterations - 1) {
+          [command_buffer waitUntilCompleted];
+        }
+      }
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+    double t0 = double(std::chrono::duration_cast<std::chrono::milliseconds>(
+                           end - start)
+                           .count()) /
+                iterations;
+    std::cout << "  Test: #" << j << " - " << t0 << "ms" << std::endl;
+  }
+  return absl::OkStatus();
+}
+
+class DelegateContext {
+ public:
+  bool Init(TfLiteContext* context,
+            const TfLiteDelegateParams* delegate_params) {
+    auto denormalized_graph =
+        reinterpret_cast<GraphFloat32*>(delegate_params->delegate->data_);
+    absl::Status status =
+        BuildModel(context, delegate_params, denormalized_graph);
+    if (!status.ok()) {
+      TF_LITE_KERNEL_LOG(context, std::string(status.message()).c_str());
+    }
+    return status.ok();
+  }
+};
+
+TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
+  const TfLiteRegistration kRegistration = {
+      .init = [](TfLiteContext* context, const char* buffer, size_t) -> void* {
+        auto* delegate_context = new DelegateContext();
+        if (!delegate_context->Init(
+                context,
+                reinterpret_cast<const TfLiteDelegateParams*>(buffer))) {
+          delete delegate_context;
+          return nullptr;
+        }
+        return delegate_context;
+      },
+      .free = [](TfLiteContext* context, void* buffer) -> void {
+        delete reinterpret_cast<DelegateContext*>(buffer);
+      },
+      .prepare = [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
+        return node->user_data ? kTfLiteOk : kTfLiteError;
+      },
+      .invoke = nullptr,
+  };
+
+  TfLiteIntArray* ops_to_replace = GetOpsToReplace(context);
+  const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
+      context, kRegistration, ops_to_replace, delegate);
+  TfLiteIntArrayFree(ops_to_replace);
+  return status;
+}
+
+absl::Status FlatBufferToGPUGraph(
+    const std::unique_ptr<tflite::FlatBufferModel>& flatbuffer,
+    GraphFloat32* graph) {
+  ops::builtin::BuiltinOpResolver op_resolver;
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  tflite::InterpreterBuilder interpreter_builder(*flatbuffer, op_resolver);
+  if (interpreter_builder(&interpreter) != kTfLiteOk || !interpreter) {
+    return absl::InternalError("Unable to prepare TfLite interpreter.");
+  }
+  TfLiteDelegate delegate;
+  delegate.data_ = graph;
+  delegate.flags = kTfLiteDelegateFlagsNone;
+  delegate.Prepare = DelegatePrepare;
+  delegate.CopyFromBufferHandle = nullptr;
+  delegate.CopyToBufferHandle = nullptr;
+  delegate.FreeBufferHandle = nullptr;
+
+  if (interpreter->ModifyGraphWithDelegate(&delegate) != kTfLiteOk) {
+    return absl::InternalError("Conversion from TfLite model failed.");
+  }
+
+  ModelTransformer transformer(graph);
+  if (!ApplyModelTransformations(&transformer)) {
+    return absl::InternalError("Graph transformations failed");
+  }
+
+  return absl::OkStatus();
+}
+
+}  // namespace
+}  // namespace metal
+}  // namespace gpu
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  @autoreleasepool {
+    NSBundle *main = [NSBundle mainBundle];
+    NSArray<NSString*>* model_paths = [main pathsForResourcesOfType:@"tflite" inDirectory:nil];
+    for (id model_path in model_paths) {
+      NSString *model_name = [[model_path lastPathComponent] stringByDeletingPathExtension];
+      std::string m_name = std::string([model_name UTF8String]);
+      std::string path = std::string([model_path UTF8String]);
+      std::cout << m_name << std::endl;
+      auto flatbuffer = tflite::FlatBufferModel::BuildFromFile(path.c_str());
+      if (!flatbuffer) {
+        std::cout << "Failed flatbuffer reading." << std::endl;
+      }
+
+      tflite::gpu::GraphFloat32 graph;
+      auto s = tflite::gpu::metal::FlatBufferToGPUGraph(flatbuffer, &graph);
+      if (!s.ok()) {
+        std::cout << "Failed flatbuffer to graph conversion. " << s.message() << std::endl;
+      }
+
+      s = tflite::gpu::metal::GPUBenchmark(&graph, 5, 200, true);
+      if (!s.ok()) {
+        std::cout << "Error in GPUBenchmark. " << s.message() << std::endl;
+      }
+    }
+  }
+
+  return 0;
+}
diff --git a/tensorflow/lite/delegates/gpu/metal/benchmarking/models/README.md b/tensorflow/lite/delegates/gpu/metal/benchmarking/models/README.md
new file mode 100644
index 0000000..1ca1483
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/metal/benchmarking/models/README.md
@@ -0,0 +1 @@
+.tflite models must be added to this folder before xcodeproj generation