// jni/run_tflite.cpp - platform/test/mlts/benchmark - Git at Google

 /**
  * Copyright 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "run_tflite.h"

 #include <android/log.h>
 #include <dirent.h>
 #include <dlfcn.h>
 #include <fcntl.h>
 #include <ftw.h>
 #include <sys/time.h>
 #include <unistd.h>

 #include <cstdio>
 #include <fstream>

 #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"

 #define LOG_TAG "NN_BENCHMARK"

 // Logs a printf-style message at ANDROID_LOG_FATAL priority under LOG_TAG and
 // then trips assert(false).
 // NOTE: assert() expands to nothing when NDEBUG is defined, so in such builds
 // execution continues after FATAL; call sites must not rely on it to stop.
 #define FATAL(fmt, ...)                                                  \
   do {                                                                   \
     __android_log_print(ANDROID_LOG_FATAL, LOG_TAG, fmt, ##__VA_ARGS__); \
     assert(false);                                                       \
   } while (0)

 namespace {

 // Returns the current time reported by gettimeofday(), expressed in
 // microseconds.
 long long currentTimeInUsec() {
   timeval tv;
   gettimeofday(&tv, nullptr);
   // Widen before multiplying: where long is 32 bits, tv_sec * 1000000L
   // overflows.
   return static_cast<long long>(tv.tv_sec) * 1000000LL + tv.tv_usec;
 }

 // Workaround for build systems that make it difficult to pick the correct NDK
 // API level. NDK tracing methods are dynamically loaded from libandroid.so.
 //
 // The NDK declares ATrace_beginSection/ATrace_endSection as returning void,
 // so the pointer types mirror that: calling a function through a pointer of a
 // mismatched type is undefined behaviour.
 using fp_ATrace_beginSection = void (*)(const char* sectionName);
 using fp_ATrace_endSection = void (*)();
 // Pair of tracing entry points resolved at runtime.
 struct TraceFunc {
   fp_ATrace_beginSection ATrace_beginSection;
   fp_ATrace_endSection ATrace_endSection;
 };
 // Opens libandroid.so and resolves the two ATrace entry points. A missing
 // library or symbol is reported through FATAL here, rather than surfacing
 // later as a call through a null function pointer.
 TraceFunc setupTraceFunc() {
   void* lib = dlopen("libandroid.so", RTLD_NOW | RTLD_LOCAL);
   if (lib == nullptr) {
     FATAL("unable to open libandroid.so");
   }
   void* beginSection = dlsym(lib, "ATrace_beginSection");
   void* endSection = dlsym(lib, "ATrace_endSection");
   if (beginSection == nullptr || endSection == nullptr) {
     FATAL("unable to resolve ATrace functions in libandroid.so");
   }
   return {reinterpret_cast<fp_ATrace_beginSection>(beginSection),
           reinterpret_cast<fp_ATrace_endSection>(endSection)};
 }
 // Tracing entry points, resolved once during static initialization.
 static TraceFunc kTraceFunc{setupTraceFunc()};

 }  // namespace

 // Factory: allocates a BenchmarkModel and runs init() on it with the given
 // arguments. Returns the heap-allocated model, or nullptr (after logging and
 // deleting the instance) when init() fails.
 BenchmarkModel* BenchmarkModel::create(const char* modelfile, bool use_nnapi,
                                        bool enable_intermediate_tensors_dump, int* nnapiErrno,
                                        const char* nnapi_device_name, bool mmapModel,
                                        const char* nnapi_cache_dir) {
   BenchmarkModel* instance = new BenchmarkModel();
   const bool initialized =
       instance->init(modelfile, use_nnapi, enable_intermediate_tensors_dump, nnapiErrno,
                      nnapi_device_name, mmapModel, nnapi_cache_dir);
   if (initialized) {
     return instance;
   }
   __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to init model %s", modelfile);
   delete instance;
   return nullptr;
 }

 // Loads the model, builds the TFLite interpreter and, when requested, applies
 // the NNAPI delegate.
 //
 //   modelfile   path of the model file; a null path is rejected.
 //   use_nnapi   when true the graph is modified with a StatefulNnApiDelegate.
 //   enable_intermediate_tensors_dump
 //               when true every node output is registered as a model output.
 //   nnapiErrno  receives the delegate's NNAPI errno when use_nnapi is true;
 //               may be null.
 //   nnapi_device_name  accelerator name handed to the delegate, or null.
 //   mmapModel   true: build the model from the file; false: read the file
 //               into mModelBuffer and build the model from that buffer.
 //   nnapi_cache_dir    stored in mCacheDir when non-null.
 //
 // Returns false, after logging, on any failure.
 bool BenchmarkModel::init(const char* modelfile, bool use_nnapi,
                           bool enable_intermediate_tensors_dump, int* nnapiErrno,
                           const char* nnapi_device_name, bool mmapModel,
                           const char* nnapi_cache_dir) {
   if (modelfile == nullptr) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Model file path is null");
     return false;
   }
   mModelFile = modelfile;
   mUseNnApi = use_nnapi;
   if (nnapi_cache_dir) {
     mCacheDir = nnapi_cache_dir;
   }
   if (nnapi_device_name) {
     mNnApiDeviceName = nnapi_device_name;
   }

   if (mmapModel) {
     // Memory map the model. NOTE this needs lifetime greater than or equal
     // to interpreter context.
     mTfliteModel = tflite::FlatBufferModel::BuildFromFile(modelfile);
   } else {
     // The model is binary data: open the stream in binary mode and fail early
     // when the file cannot be opened.
     std::ifstream modelStream(modelfile, std::ios::in | std::ios::binary);
     if (!modelStream) {
       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to open model %s",
                           modelfile);
       return false;
     }
     mModelBuffer = std::string((std::istreambuf_iterator<char>(modelStream)),
                                std::istreambuf_iterator<char>());
     mTfliteModel =
         tflite::FlatBufferModel::BuildFromBuffer(mModelBuffer.c_str(), mModelBuffer.size());
   }
   if (!mTfliteModel) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to load model %s",
                         modelfile);
     return false;
   }

   tflite::ops::builtin::BuiltinOpResolver resolver;
   tflite::InterpreterBuilder(*mTfliteModel, resolver)(&mTfliteInterpreter);
   if (!mTfliteInterpreter) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                         "Failed to create TFlite interpreter");
     return false;
   }

   if (enable_intermediate_tensors_dump) {
     // Make output of every op a model output. This way we will be able to
     // fetch each intermediate tensor when running with delegates.
     outputs.clear();
     for (size_t node = 0; node < mTfliteInterpreter->nodes_size(); ++node) {
       auto node_outputs =
           mTfliteInterpreter->node_and_registration(node)->first.outputs;
       outputs.insert(outputs.end(), node_outputs->data,
                      node_outputs->data + node_outputs->size);
     }
     mTfliteInterpreter->SetOutputs(outputs);
   }

   // Allow Fp16 precision for all models
   mTfliteInterpreter->SetAllowFp16PrecisionForFp32(true);

   if (use_nnapi) {
     tflite::StatefulNnApiDelegate::Options nnapi_options;
     nnapi_options.accelerator_name = nnapi_device_name;
     mTfliteNnapiDelegate = std::make_unique<tflite::StatefulNnApiDelegate>(nnapi_options);
     const int delegationStatus =
         mTfliteInterpreter->ModifyGraphWithDelegate(mTfliteNnapiDelegate.get());
     const int delegateErrno = mTfliteNnapiDelegate->GetNnApiErrno();
     // The out-parameter is optional; never write through a null pointer.
     if (nnapiErrno != nullptr) {
       *nnapiErrno = delegateErrno;
     }
     if (delegationStatus != kTfLiteOk ||
         delegateErrno != ANEURALNETWORKS_NO_ERROR) {
       __android_log_print(
           ANDROID_LOG_ERROR, LOG_TAG,
           "Failed to initialize NNAPI Delegate for model %s, nnapi_errno is %d",
           modelfile, delegateErrno);
       return false;
     }
   }
   return true;
 }

 // Neither the constructor nor the destructor has custom logic; the model is
 // set up by init().
 BenchmarkModel::BenchmarkModel() = default;
 BenchmarkModel::~BenchmarkModel() = default;

 // Copies `length` bytes from dataPtr into the first input tensor.
 // Only float32 and uint8 input tensors are supported.
 // Returns false (after logging) when the tensor type is unsupported, when the
 // data would not fit into the tensor, or when either buffer is null.
 bool BenchmarkModel::setInput(const uint8_t* dataPtr, size_t length) {
   const int input = mTfliteInterpreter->inputs()[0];
   auto* input_tensor = mTfliteInterpreter->tensor(input);

   switch (input_tensor->type) {
     case kTfLiteFloat32:
     case kTfLiteUInt8:
       break;
     default:
       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                           "Input tensor type not supported");
       return false;
   }

   // Never write past the end of the tensor buffer.
   if (length > input_tensor->bytes) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                         "Input data (%zu bytes) does not fit input tensor (%zu bytes)",
                         length, input_tensor->bytes);
     return false;
   }
   if (length == 0) {
     return true;
   }
   void* raw = input_tensor->data.raw;
   if (dataPtr == nullptr || raw == nullptr) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                         "Input data or input tensor buffer is null");
     return false;
   }
   memcpy(raw, dataPtr, length);
   return true;
 }
 // Appends the bytes of the model output at position output_index to
 // result->inferenceOutputs[output_index].
 void BenchmarkModel::saveInferenceOutput(InferenceResult* result,
                                          int output_index) {
   const int tensor_id = mTfliteInterpreter->outputs()[output_index];
   const auto* tensor = mTfliteInterpreter->tensor(tensor_id);
   const uint8_t* const first = tensor->data.uint8;
   const uint8_t* const last = first + tensor->bytes;
   auto& destination = result->inferenceOutputs[output_index];
   destination.insert(destination.end(), first, last);
 }

 // Compares the model output at position output_index with expected_data and
 // stores the mean square error and the largest absolute single-element error
 // in result->meanSquareErrors / result->maxSingleErrors at output_index.
 //
 // `length` is the size of expected_data in bytes and must equal the tensor
 // size. Only uint8 and float32 output tensors are supported. On a size
 // mismatch or unsupported type FATAL is raised and nothing is written.
 void BenchmarkModel::getOutputError(const uint8_t* expected_data, size_t length,
                                     InferenceResult* result, int output_index) {
   const int output = mTfliteInterpreter->outputs()[output_index];
   auto* output_tensor = mTfliteInterpreter->tensor(output);
   if (output_tensor->bytes != length) {
     FATAL("Wrong size of output tensor, expected %zu, is %zu", length,
           output_tensor->bytes);
     // FATAL does not stop execution when assertions are compiled out; never
     // compare buffers of different sizes.
     return;
   }

   size_t elements_count = 0;
   float err_sum = 0.0f;
   float max_error = 0.0f;
   // Folds one element's signed error into the running statistics. The maximum
   // tracks the magnitude so that negative deviations are counted as well.
   const auto accumulate = [&err_sum, &max_error](float err) {
     const float magnitude = err < 0.0f ? -err : err;
     if (magnitude > max_error) max_error = magnitude;
     err_sum += err * err;
   };

   switch (output_tensor->type) {
     case kTfLiteUInt8: {
       const uint8_t* output_raw = mTfliteInterpreter->typed_tensor<uint8_t>(output);
       elements_count = output_tensor->bytes;
       for (size_t i = 0; i < elements_count; ++i) {
         accumulate(static_cast<float>(output_raw[i]) -
                    static_cast<float>(expected_data[i]));
       }
       break;
     }
     case kTfLiteFloat32: {
       const float* expected = reinterpret_cast<const float*>(expected_data);
       const float* output_raw = mTfliteInterpreter->typed_tensor<float>(output);
       elements_count = output_tensor->bytes / sizeof(float);
       for (size_t i = 0; i < elements_count; ++i) {
         accumulate(output_raw[i] - expected[i]);
       }
       break;
     }
     default:
       FATAL("Output tensor type %d not supported",
             static_cast<int>(output_tensor->type));
       return;
   }
   // An empty tensor has no error; avoid dividing by zero.
   result->meanSquareErrors[output_index] =
       elements_count == 0 ? 0.0f : err_sum / elements_count;
   result->maxSingleErrors[output_index] = max_error;
 }

 // Resizes the first input tensor to `shape` and re-allocates all tensors.
 // Returns false (after logging) when either step fails.
 bool BenchmarkModel::resizeInputTensors(std::vector<int> shape) {
   // The benchmark only expects single input tensor, hardcoded as 0.
   const int input = mTfliteInterpreter->inputs()[0];
   if (mTfliteInterpreter->ResizeInputTensor(input, shape) != kTfLiteOk) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                         "Failed to resize input tensor!");
     return false;
   }
   if (mTfliteInterpreter->AllocateTensors() != kTfLiteOk) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                         "Failed to allocate tensors!");
     return false;
   }
   return true;
 }

 // Invokes the interpreter once. Succeeds only when TFLite reports kTfLiteOk
 // and, if an NNAPI delegate is present, the delegate reports no NNAPI error.
 bool BenchmarkModel::runInference() {
   const auto invoke_status = mTfliteInterpreter->Invoke();

   int delegate_errno = ANEURALNETWORKS_NO_ERROR;
   if (mTfliteNnapiDelegate) {
     delegate_errno = mTfliteNnapiDelegate->GetNnApiErrno();
   }

   const bool succeeded =
       invoke_status == kTfLiteOk && delegate_errno == ANEURALNETWORKS_NO_ERROR;
   if (succeeded) {
     return true;
   }
   __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                       "Failed to invoke, tflite status: %d, nnapi errno: %d!",
                       static_cast<int>(invoke_status), delegate_errno);
   return false;
 }

 // Resets the interpreter's variable tensors. Logs and returns false when the
 // interpreter reports anything other than kTfLiteOk.
 bool BenchmarkModel::resetStates() {
   const auto reset_status = mTfliteInterpreter->ResetVariableTensors();
   if (reset_status == kTfLiteOk) {
     return true;
   }
   __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                       "Failed to reset variable tensors: %d!",
                       static_cast<int>(reset_status));
   return false;
 }

 // Runs up to seqInferencesMaxCount input sequences (cycling through
 // inOutData) and appends one InferenceResult per inference to *results.
 //
 //   timeout  when greater than 0.001, stop once the accumulated inference
 //            time (seconds) exceeds it; checked after each full sequence.
 //   flags    FLAG_IGNORE_GOLDEN_OUTPUT skips the golden output comparison,
 //            FLAG_DISCARD_INFERENCE_OUTPUT skips saving the raw outputs.
 //
 // Returns true when inOutData is empty (with a warning) or when every
 // inference that ran succeeded; false on the first failure.
 bool BenchmarkModel::benchmark(
     const std::vector<InferenceInOutSequence>& inOutData,
     int seqInferencesMaxCount, float timeout, int flags,
     std::vector<InferenceResult>* results) {
   if (inOutData.empty()) {
     __android_log_print(ANDROID_LOG_WARN, LOG_TAG,
                         "Input/output vector is empty");
     return true;
   }
   if (results == nullptr) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Results vector is null");
     return false;
   }

   float inferenceTotal = 0.0f;
   for (int seqInferenceIndex = 0; seqInferenceIndex < seqInferencesMaxCount;
        ++seqInferenceIndex) {
     // resetStates() logs its own failures; the run continues regardless.
     resetStates();

     const int inputOutputSequenceIndex =
         static_cast<int>(seqInferenceIndex % inOutData.size());
     const InferenceInOutSequence& seq = inOutData[inputOutputSequenceIndex];
     const int seqLength = static_cast<int>(seq.size());
     for (int i = 0; i < seqLength; ++i) {
       const InferenceInOut& data = seq[i];

       // For NNAPI systrace usage documentation, see
       // frameworks/ml/nn/common/include/Tracing.h.
       kTraceFunc.ATrace_beginSection("[NN_LA_PE]BenchmarkModel::benchmark");
       kTraceFunc.ATrace_beginSection("[NN_LA_PIO]BenchmarkModel::input");
       bool inputReady = false;
       if (data.input) {
         inputReady = setInput(data.input, data.input_size);
       } else {
         const int input = mTfliteInterpreter->inputs()[0];
         auto* input_tensor = mTfliteInterpreter->tensor(input);
         inputReady =
             data.createInput(reinterpret_cast<uint8_t*>(input_tensor->data.raw),
                              input_tensor->bytes);
       }
       kTraceFunc.ATrace_endSection();  // input section
       if (!inputReady) {
         // Close the outer section too, so trace sections stay balanced.
         kTraceFunc.ATrace_endSection();
         __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                             "Input creation %d failed", i);
         return false;
       }

       // Read the end time before closing the trace section so the measured
       // interval covers only the inference itself.
       const long long startTime = currentTimeInUsec();
       const bool success = runInference();
       const long long endTime = currentTimeInUsec();
       kTraceFunc.ATrace_endSection();  // benchmark section
       if (!success) {
         __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                             i);
         return false;
       }

       const float inferenceTime =
           static_cast<float>(endTime - startTime) / 1000000.0f;
       const size_t outputsCount = mTfliteInterpreter->outputs().size();
       InferenceResult result{
           inferenceTime, {}, {}, {}, inputOutputSequenceIndex, i};
       result.meanSquareErrors.resize(outputsCount);
       result.maxSingleErrors.resize(outputsCount);
       result.inferenceOutputs.resize(outputsCount);

       if ((flags & FLAG_IGNORE_GOLDEN_OUTPUT) == 0) {
         if (outputsCount != data.outputs.size()) {
           __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                               "Golden/actual outputs (%zu/%zu) count mismatch",
                               data.outputs.size(), outputsCount);
           return false;
         }
         for (size_t j = 0; j < outputsCount; ++j) {
           getOutputError(data.outputs[j].ptr, data.outputs[j].size, &result,
                          static_cast<int>(j));
         }
       }

       if ((flags & FLAG_DISCARD_INFERENCE_OUTPUT) == 0) {
         for (size_t j = 0; j < outputsCount; ++j) {
           saveInferenceOutput(&result, static_cast<int>(j));
         }
       }
       // The result may carry full output buffers; move instead of copying.
       results->push_back(std::move(result));
       inferenceTotal += inferenceTime;
     }

     // Timeout?
     if (timeout > 0.001 && inferenceTotal > timeout) {
       return true;
     }
   }
   return true;
 }

 // If cacheDir is not nullptr, compilation caching will be used with NNAPI.
 bool BenchmarkModel::runCompilation(const char* cacheDir) {
   std::unique_ptr<tflite::Interpreter> interpreter;
   tflite::ops::builtin::BuiltinOpResolver resolver;
   tflite::InterpreterBuilder(*mTfliteModel, resolver)(&interpreter);
   if (!interpreter) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to create TFlite interpreter");
     return false;
   }

   // Allow Fp16 precision for all models
   interpreter->SetAllowFp16PrecisionForFp32(true);

   if (mUseNnApi) {
     tflite::StatefulNnApiDelegate::Options nnapi_options;
     nnapi_options.accelerator_name = mNnApiDeviceName.empty() ? nullptr : mNnApiDeviceName.c_str();
     if (cacheDir) {
       nnapi_options.cache_dir = cacheDir;
       nnapi_options.model_token = mModelFile.c_str();
     }
     tflite::StatefulNnApiDelegate delegate(nnapi_options);
     int delegationStatus = interpreter->ModifyGraphWithDelegate(&delegate);
     auto nnapiErrno = delegate.GetNnApiErrno();
     if (delegationStatus != kTfLiteOk || nnapiErrno != ANEURALNETWORKS_NO_ERROR) {
       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                           "Failed to initialize NNAPI Delegate for model %s, nnapi_errno is %d",
                           mModelFile.c_str(), nnapiErrno);
       return false;
     }
   }
   return true;
 }

 // A helper class to manage the lifetime of a temporary cache directory.
 // The directory is created under `base` by recreate() and removed, with its
 // contents, by the next recreate() or by the destructor.
 class ScopedTempDirectory {
  public:
   explicit ScopedTempDirectory(std::string base) : mBase(std::move(base)) {}
   ~ScopedTempDirectory() { cleanup(); }

   // The destructor deletes the directory, so a copy would delete it twice.
   ScopedTempDirectory(const ScopedTempDirectory&) = delete;
   ScopedTempDirectory& operator=(const ScopedTempDirectory&) = delete;

   // Create a new temp directory, remove the old one if needed.
   // If the directory cannot be created, get() returns nullptr afterwards.
   void recreate() {
     cleanup();
     mTempDir = mBase + "/XXXXXX";
     if (mkdtemp(&mTempDir[0]) == nullptr) {
       // Do not hand out (or later try to delete) a path that was never created.
       mTempDir.clear();
     }
   }

   // Get the path to the temp directory, or nullptr when none exists.
   const char* get() const { return mTempDir.empty() ? nullptr : mTempDir.c_str(); }

  private:
   // Removes the temp directory and everything below it, if there is one.
   void cleanup() {
     if (mTempDir.empty()) {
       return;
     }
     // FTW_DEPTH visits a directory's contents before the directory itself, so
     // each remove() acts on a file or an already-emptied directory.
     auto callback = [](const char* entry, const struct stat*, int, struct FTW*) {
       return remove(entry);
     };
     nftw(mTempDir.c_str(), callback, 128, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
     mTempDir.clear();
   }

   std::string mBase;
   std::string mTempDir;
 };

 bool BenchmarkModel::getCompilationCacheSize(int* cacheSizeBytes) {
   if (cacheSizeBytes == nullptr) return false;

   // Create cache files.
   ScopedTempDirectory tempDir(mCacheDir.value());
   tempDir.recreate();
   const bool success = runCompilation(tempDir.get());
   if (!success) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Save to cache failed");
     return false;
   }

   // Compute total size of cache files.
   int totalSize = 0;
   DIR* dir = opendir(tempDir.get());
   if (dir == nullptr) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to open cache directory");
     return false;
   }
   struct dirent* dp = nullptr;
   while ((dp = readdir(dir)) != nullptr) {
     char fullPath[1024];
     snprintf(fullPath, 1024, "%s/%s", tempDir.get(), dp->d_name);
     struct stat st;
     int err = stat(fullPath, &st);
     if (err != 0) {
       closedir(dir);
       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to stat %s", fullPath);
       return false;
     }
     // Only accumulate sizes of regular files. This will exclude '.' and '..'.
     if (S_ISREG(st.st_mode)) {
       totalSize += st.st_size;
     }
   }
   closedir(dir);
   *cacheSizeBytes = totalSize;
   return true;
 }

 // Times up to maxNumIterations compilations of the given type and appends
 // each duration (seconds) to *results when results is non-null.
 //
 //   WITHOUT_CACHE       compile with no cache directory.
 //   SAVE_TO_CACHE       compile into a freshly recreated cache directory on
 //                       every iteration (cache miss).
 //   PREPARE_FROM_CACHE  populate the cache once up front, then compile from
 //                       it on every iteration (cache hit).
 //
 // When timeout is greater than 0.001 the loop stops once the accumulated
 // compilation time exceeds it. Returns false (after logging) on any failure.
 bool BenchmarkModel::benchmarkSingleTypeOfCompilation(CompilationBenchmarkType type,
                                                       int maxNumIterations, float timeout,
                                                       std::vector<float>* results) {
   if (results != nullptr) {
     results->clear();
   }
   // A cache directory is only required by the cache-based benchmark types, so
   // an unset mCacheDir must not break WITHOUT_CACHE.
   ScopedTempDirectory tempDir(mCacheDir.value_or(std::string()));

   // (Re)creates the temporary cache directory. Fails when no cache directory
   // is configured or the directory cannot be created.
   const auto recreateCacheDir = [this, &tempDir]() {
     if (!mCacheDir.has_value()) {
       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Cache directory is not set");
       return false;
     }
     tempDir.recreate();
     if (tempDir.get() == nullptr) {
       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                           "Failed to create temporary cache directory");
       return false;
     }
     return true;
   };

   // Initialize cache files to benchmark cache hit.
   if (type == CompilationBenchmarkType::PREPARE_FROM_CACHE) {
     if (!recreateCacheDir()) {
       return false;
     }
     const bool success = runCompilation(tempDir.get());
     if (!success) {
       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Save to cache failed");
       return false;
     }
   }

   float compilationTotal = 0.0f;
   for (int i = 0; i < maxNumIterations; i++) {
     const char* cacheDir = nullptr;
     switch (type) {
       case CompilationBenchmarkType::WITHOUT_CACHE:
         cacheDir = nullptr;
         break;
       case CompilationBenchmarkType::SAVE_TO_CACHE:
         // Remove the cache files from the last iteration to benchmark cache miss.
         if (!recreateCacheDir()) {
           return false;
         }
         [[fallthrough]];
       case CompilationBenchmarkType::PREPARE_FROM_CACHE:
         cacheDir = tempDir.get();
         break;
       default:
         __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Unknown CompilationBenchmarkType: %d",
                             static_cast<int>(type));
         return false;
     }

     kTraceFunc.ATrace_beginSection("[NN_LA_PC]BenchmarkModel::benchmarkCompilation");
     const long long startTime = currentTimeInUsec();
     const bool success = runCompilation(cacheDir);
     const long long endTime = currentTimeInUsec();
     kTraceFunc.ATrace_endSection();
     if (!success) {
       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Compilation %d failed", i);
       return false;
     }

     const float compilationTime = static_cast<float>(endTime - startTime) / 1000000.0f;
     if (results != nullptr) {
       results->push_back(compilationTime);
     }

     // Timeout?
     compilationTotal += compilationTime;
     if (timeout > 0.001 && compilationTotal > timeout) {
       return true;
     }
   }
   return true;
 }

 // Runs the given compilation benchmark twice: first a warmup pass bounded by
 // warmupTimeout whose timings are discarded, then the measured pass bounded
 // by runTimeout whose timings go to *results. Each pass is wrapped in its own
 // trace section. Returns false as soon as either pass fails.
 bool BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup(CompilationBenchmarkType type,
                                                                 int maxNumIterations,
                                                                 float warmupTimeout,
                                                                 float runTimeout,
                                                                 std::vector<float>* results) {
   // Warmup pass.
   kTraceFunc.ATrace_beginSection(
           "[NN_LA_PWM]BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup");
   const bool warmupOk =
           benchmarkSingleTypeOfCompilation(type, maxNumIterations, warmupTimeout, nullptr);
   kTraceFunc.ATrace_endSection();
   if (!warmupOk) {
     return false;
   }

   // Measured pass.
   kTraceFunc.ATrace_beginSection(
           "[NN_LA_PBM]BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup");
   const bool runOk =
           benchmarkSingleTypeOfCompilation(type, maxNumIterations, runTimeout, results);
   kTraceFunc.ATrace_endSection();
   return runOk;
 }

 // Runs the full compilation benchmark and fills *result:
 //   1. compilation without cache -> result->compileWithoutCacheTimeSec
 //   2. compilation cache size    -> result->cacheSizeBytes
 //   3. only when the cache size is positive: saving to cache and preparing
 //      from cache -> result->saveToCacheTimeSec / prepareFromCacheTimeSec
 // Each timed step runs a warmup pass (warmupTimeout) and a measured pass
 // (runTimeout) of at most maxNumIterations iterations.
 // Returns false when result is null or any step fails.
 bool BenchmarkModel::benchmarkCompilation(int maxNumIterations, float warmupTimeout,
                                           float runTimeout, CompilationBenchmarkResult* result) {
   if (result == nullptr) return false;

   // Benchmark compile without cache.
   bool success = benchmarkSingleTypeOfCompilationWithWarmup(
           CompilationBenchmarkType::WITHOUT_CACHE, maxNumIterations, warmupTimeout, runTimeout,
           &result->compileWithoutCacheTimeSec);
   if (!success) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                         "Failed to benchmark compilation without cache");
     return false;
   }

   // Get compilation cache size.
   success = getCompilationCacheSize(&result->cacheSizeBytes);
   if (!success) {
     __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to retrieve compilation cache size");
     return false;
   }

   // Benchmark saving to cache and preparing from cache only if supported.
   if (result->cacheSizeBytes > 0) {
     // Benchmark saving to cache.
     auto& saveToCacheTimeSec = result->saveToCacheTimeSec.emplace();
     success = benchmarkSingleTypeOfCompilationWithWarmup(
             CompilationBenchmarkType::SAVE_TO_CACHE, maxNumIterations, warmupTimeout, runTimeout,
             &saveToCacheTimeSec);
     if (!success) {
       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to benchmark saving to cache");
       return false;
     }

     // Benchmark preparing from cache.
     auto& prepareFromCacheTimeSec = result->prepareFromCacheTimeSec.emplace();
     success = benchmarkSingleTypeOfCompilationWithWarmup(
             CompilationBenchmarkType::PREPARE_FROM_CACHE, maxNumIterations, warmupTimeout,
             runTimeout, &prepareFromCacheTimeSec);
     if (!success) {
       __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to benchmark preparing from cache");
       return false;
     }
   }
   // Every step succeeded. (Previously this returned the `result` pointer
   // converted to bool.)
   return true;
 }

 // Runs every inference of every sequence in inOutData and, after each one,
 // writes each tensor listed in `outputs` to its own file
 //   <path>/dump_<seq>_seq_<inference>_order_<order>_tensor_<tensor index>
 // Tensors without a data buffer are logged and skipped.
 // Returns false (after logging) when the input cannot be set, an inference
 // fails, or a dump file cannot be written.
 bool BenchmarkModel::dumpAllLayers(
     const char* path, const std::vector<InferenceInOutSequence>& inOutData) {
   if (inOutData.empty()) {
     FATAL("Input/output vector is empty");
     // FATAL does not stop execution when assertions are compiled out.
     return false;
   }

   const int sequenceCount = static_cast<int>(inOutData.size());
   for (int seqInferenceIndex = 0; seqInferenceIndex < sequenceCount;
        ++seqInferenceIndex) {
     resetStates();

     const InferenceInOutSequence& seq = inOutData[seqInferenceIndex];
     const int seqLength = static_cast<int>(seq.size());
     for (int i = 0; i < seqLength; ++i) {
       const InferenceInOut& data = seq[i];
       // Same input handling as benchmark(): use the provided buffer, or let
       // the entry generate its input when it has none.
       bool inputReady = false;
       if (data.input) {
         inputReady = setInput(data.input, data.input_size);
       } else {
         const int input = mTfliteInterpreter->inputs()[0];
         auto* input_tensor = mTfliteInterpreter->tensor(input);
         inputReady =
             data.createInput(reinterpret_cast<uint8_t*>(input_tensor->data.raw),
                              input_tensor->bytes);
       }
       if (!inputReady) {
         __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                             "Input creation %d failed", i);
         return false;
       }

       const bool success = runInference();
       if (!success) {
         __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                             i);
         return false;
       }

       // The order of the tensor is not sorted by the tensor index
       const int tensorCount = static_cast<int>(outputs.size());
       for (int tensor_order = 0; tensor_order < tensorCount; ++tensor_order) {
         const int tensor_index = outputs[tensor_order];
         auto* output_tensor = mTfliteInterpreter->tensor(tensor_index);
         if (output_tensor->data.raw == nullptr) {
           __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                       "output_tensor->data.raw == nullptr at index %d ", tensor_index);
           continue;
         }
         char fullpath[1024];
         const int pathLength =
             snprintf(fullpath, sizeof(fullpath), "%s/dump_%.3d_seq_%.3d_order_%.3d_tensor_%.3d",
                      path, seqInferenceIndex, i, tensor_order, tensor_index);
         if (pathLength < 0 || static_cast<size_t>(pathLength) >= sizeof(fullpath)) {
           __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                               "Dump path is too long for tensor %d", tensor_index);
           return false;
         }
         FILE* f = fopen(fullpath, "wb");
         if (f == nullptr) {
           __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                               "Failed to open %s for writing", fullpath);
           return false;
         }
         // fwrite reports 0 items for a zero-sized tensor, which is not an error.
         const size_t written =
             fwrite(output_tensor->data.raw, output_tensor->bytes, 1, f);
         const bool writeOk = output_tensor->bytes == 0 || written == 1;
         const bool closeOk = fclose(f) == 0;
         if (!writeOk || !closeOk) {
           __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                               "Failed to write %s", fullpath);
           return false;
         }
       }
     }
   }
   return true;
 }
	/**
	* Copyright 2017 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "run_tflite.h"

	#include <android/log.h>
	#include <dirent.h>
	#include <dlfcn.h>
	#include <fcntl.h>
	#include <ftw.h>
	#include <sys/time.h>
	#include <unistd.h>

	#include <cstdio>
	#include <fstream>

	#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
	#include "tensorflow/lite/kernels/register.h"
	#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"

	#define LOG_TAG "NN_BENCHMARK"

	// Logs a printf-style message at ANDROID_LOG_FATAL priority under LOG_TAG and
	// then trips assert(false).
	// NOTE: assert() expands to nothing when NDEBUG is defined, so in such builds
	// execution continues after FATAL; call sites must not rely on it to stop.
	#define FATAL(fmt, ...) \
	do { \
	__android_log_print(ANDROID_LOG_FATAL, LOG_TAG, fmt, ##__VA_ARGS__); \
	assert(false); \
	} while (0)

	namespace {

	// Returns the current time reported by gettimeofday(), expressed in
	// microseconds.
	long long currentTimeInUsec() {
	  timeval tv;
	  gettimeofday(&tv, nullptr);
	  // Widen before multiplying: where long is 32 bits, tv_sec * 1000000L
	  // overflows.
	  return static_cast<long long>(tv.tv_sec) * 1000000LL + tv.tv_usec;
	}

	// Workaround for build systems that make it difficult to pick the correct NDK
	// API level. NDK tracing methods are dynamically loaded from libandroid.so.
	//
	// The NDK declares ATrace_beginSection/ATrace_endSection as returning void,
	// so the pointer types mirror that: calling a function through a pointer of a
	// mismatched type is undefined behaviour. (The begin-section alias had also
	// lost its pointer markers and no longer declared a function pointer.)
	using fp_ATrace_beginSection = void (*)(const char* sectionName);
	using fp_ATrace_endSection = void (*)();
	// Pair of tracing entry points resolved at runtime.
	struct TraceFunc {
	  fp_ATrace_beginSection ATrace_beginSection;
	  fp_ATrace_endSection ATrace_endSection;
	};
	// Opens libandroid.so and resolves the two ATrace entry points. A missing
	// library or symbol is reported through FATAL here, rather than surfacing
	// later as a call through a null function pointer.
	TraceFunc setupTraceFunc() {
	  // Plain bitwise OR of the dlopen flags (the escaped "\|" was not valid C++).
	  void* lib = dlopen("libandroid.so", RTLD_NOW | RTLD_LOCAL);
	  if (lib == nullptr) {
	    FATAL("unable to open libandroid.so");
	  }
	  void* beginSection = dlsym(lib, "ATrace_beginSection");
	  void* endSection = dlsym(lib, "ATrace_endSection");
	  if (beginSection == nullptr || endSection == nullptr) {
	    FATAL("unable to resolve ATrace functions in libandroid.so");
	  }
	  return {reinterpret_cast<fp_ATrace_beginSection>(beginSection),
	          reinterpret_cast<fp_ATrace_endSection>(endSection)};
	}
	// Tracing entry points, resolved once during static initialization.
	static TraceFunc kTraceFunc{setupTraceFunc()};

	} // namespace

	// Factory: allocates a BenchmarkModel and runs init() on it with the given
	// arguments. Returns the heap-allocated model, or nullptr (after logging and
	// deleting the instance) when init() fails.
	BenchmarkModel* BenchmarkModel::create(const char* modelfile, bool use_nnapi,
	                                       bool enable_intermediate_tensors_dump, int* nnapiErrno,
	                                       const char* nnapi_device_name, bool mmapModel,
	                                       const char* nnapi_cache_dir) {
	  BenchmarkModel* instance = new BenchmarkModel();
	  const bool initialized =
	      instance->init(modelfile, use_nnapi, enable_intermediate_tensors_dump, nnapiErrno,
	                     nnapi_device_name, mmapModel, nnapi_cache_dir);
	  if (initialized) {
	    return instance;
	  }
	  __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to init model %s", modelfile);
	  delete instance;
	  return nullptr;
	}

	// Loads the model, builds the TFLite interpreter and, when requested, applies
	// the NNAPI delegate.
	//
	//   modelfile   path of the model file; a null path is rejected.
	//   use_nnapi   when true the graph is modified with a StatefulNnApiDelegate.
	//   enable_intermediate_tensors_dump
	//               when true every node output is registered as a model output.
	//   nnapiErrno  receives the delegate's NNAPI errno when use_nnapi is true;
	//               may be null.
	//   nnapi_device_name  accelerator name handed to the delegate, or null.
	//   mmapModel   true: build the model from the file; false: read the file
	//               into mModelBuffer and build the model from that buffer.
	//   nnapi_cache_dir    stored in mCacheDir when non-null.
	//
	// Returns false, after logging, on any failure.
	bool BenchmarkModel::init(const char* modelfile, bool use_nnapi,
	                          bool enable_intermediate_tensors_dump, int* nnapiErrno,
	                          const char* nnapi_device_name, bool mmapModel,
	                          const char* nnapi_cache_dir) {
	  if (modelfile == nullptr) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Model file path is null");
	    return false;
	  }
	  mModelFile = modelfile;
	  mUseNnApi = use_nnapi;
	  if (nnapi_cache_dir) {
	    mCacheDir = nnapi_cache_dir;
	  }
	  if (nnapi_device_name) {
	    mNnApiDeviceName = nnapi_device_name;
	  }

	  if (mmapModel) {
	    // Memory map the model. NOTE this needs lifetime greater than or equal
	    // to interpreter context.
	    mTfliteModel = tflite::FlatBufferModel::BuildFromFile(modelfile);
	  } else {
	    // The model is binary data: open the stream in binary mode and fail early
	    // when the file cannot be opened.
	    std::ifstream modelStream(modelfile, std::ios::in | std::ios::binary);
	    if (!modelStream) {
	      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to open model %s",
	                          modelfile);
	      return false;
	    }
	    mModelBuffer = std::string((std::istreambuf_iterator<char>(modelStream)),
	                               std::istreambuf_iterator<char>());
	    mTfliteModel =
	        tflite::FlatBufferModel::BuildFromBuffer(mModelBuffer.c_str(), mModelBuffer.size());
	  }
	  if (!mTfliteModel) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to load model %s",
	                        modelfile);
	    return false;
	  }

	  tflite::ops::builtin::BuiltinOpResolver resolver;
	  tflite::InterpreterBuilder(*mTfliteModel, resolver)(&mTfliteInterpreter);
	  if (!mTfliteInterpreter) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                        "Failed to create TFlite interpreter");
	    return false;
	  }

	  if (enable_intermediate_tensors_dump) {
	    // Make output of every op a model output. This way we will be able to
	    // fetch each intermediate tensor when running with delegates.
	    outputs.clear();
	    for (size_t node = 0; node < mTfliteInterpreter->nodes_size(); ++node) {
	      auto node_outputs =
	          mTfliteInterpreter->node_and_registration(node)->first.outputs;
	      outputs.insert(outputs.end(), node_outputs->data,
	                     node_outputs->data + node_outputs->size);
	    }
	    mTfliteInterpreter->SetOutputs(outputs);
	  }

	  // Allow Fp16 precision for all models
	  mTfliteInterpreter->SetAllowFp16PrecisionForFp32(true);

	  if (use_nnapi) {
	    tflite::StatefulNnApiDelegate::Options nnapi_options;
	    nnapi_options.accelerator_name = nnapi_device_name;
	    mTfliteNnapiDelegate = std::make_unique<tflite::StatefulNnApiDelegate>(nnapi_options);
	    const int delegationStatus =
	        mTfliteInterpreter->ModifyGraphWithDelegate(mTfliteNnapiDelegate.get());
	    const int delegateErrno = mTfliteNnapiDelegate->GetNnApiErrno();
	    // The out-parameter is optional; never write through a null pointer.
	    if (nnapiErrno != nullptr) {
	      *nnapiErrno = delegateErrno;
	    }
	    if (delegationStatus != kTfLiteOk ||
	        delegateErrno != ANEURALNETWORKS_NO_ERROR) {
	      __android_log_print(
	          ANDROID_LOG_ERROR, LOG_TAG,
	          "Failed to initialize NNAPI Delegate for model %s, nnapi_errno is %d",
	          modelfile, delegateErrno);
	      return false;
	    }
	  }
	  return true;
	}

	// Neither the constructor nor the destructor has custom logic; the model is
	// set up by init().
	BenchmarkModel::BenchmarkModel() = default;
	BenchmarkModel::~BenchmarkModel() = default;

	// Copies `length` bytes from dataPtr into the first input tensor.
	// Only float32 and uint8 input tensors are supported.
	// Returns false (after logging) when the tensor type is unsupported, when the
	// data would not fit into the tensor, or when either buffer is null.
	bool BenchmarkModel::setInput(const uint8_t* dataPtr, size_t length) {
	  const int input = mTfliteInterpreter->inputs()[0];
	  auto* input_tensor = mTfliteInterpreter->tensor(input);

	  switch (input_tensor->type) {
	    case kTfLiteFloat32:
	    case kTfLiteUInt8:
	      break;
	    default:
	      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                          "Input tensor type not supported");
	      return false;
	  }

	  // Never write past the end of the tensor buffer.
	  if (length > input_tensor->bytes) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                        "Input data (%zu bytes) does not fit input tensor (%zu bytes)",
	                        length, input_tensor->bytes);
	    return false;
	  }
	  if (length == 0) {
	    return true;
	  }
	  void* raw = input_tensor->data.raw;
	  if (dataPtr == nullptr || raw == nullptr) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                        "Input data or input tensor buffer is null");
	    return false;
	  }
	  memcpy(raw, dataPtr, length);
	  return true;
	}
	// Appends the bytes of the model output at position output_index to
	// result->inferenceOutputs[output_index].
	void BenchmarkModel::saveInferenceOutput(InferenceResult* result,
	                                         int output_index) {
	  const int tensor_id = mTfliteInterpreter->outputs()[output_index];
	  const auto* tensor = mTfliteInterpreter->tensor(tensor_id);
	  const uint8_t* const first = tensor->data.uint8;
	  const uint8_t* const last = first + tensor->bytes;
	  auto& destination = result->inferenceOutputs[output_index];
	  destination.insert(destination.end(), first, last);
	}

	// Compares the model output at position output_index with expected_data and
	// stores the mean square error and the largest absolute single-element error
	// in result->meanSquareErrors / result->maxSingleErrors at output_index.
	//
	// `length` is the size of expected_data in bytes and must equal the tensor
	// size. Only uint8 and float32 output tensors are supported. On a size
	// mismatch or unsupported type FATAL is raised and nothing is written.
	void BenchmarkModel::getOutputError(const uint8_t* expected_data, size_t length,
	                                    InferenceResult* result, int output_index) {
	  const int output = mTfliteInterpreter->outputs()[output_index];
	  auto* output_tensor = mTfliteInterpreter->tensor(output);
	  if (output_tensor->bytes != length) {
	    FATAL("Wrong size of output tensor, expected %zu, is %zu", length,
	          output_tensor->bytes);
	    // FATAL does not stop execution when assertions are compiled out; never
	    // compare buffers of different sizes.
	    return;
	  }

	  size_t elements_count = 0;
	  float err_sum = 0.0f;
	  float max_error = 0.0f;
	  // Folds one element's signed error into the running statistics. The maximum
	  // tracks the magnitude so that negative deviations are counted as well.
	  const auto accumulate = [&err_sum, &max_error](float err) {
	    const float magnitude = err < 0.0f ? -err : err;
	    if (magnitude > max_error) max_error = magnitude;
	    err_sum += err * err;
	  };

	  switch (output_tensor->type) {
	    case kTfLiteUInt8: {
	      const uint8_t* output_raw = mTfliteInterpreter->typed_tensor<uint8_t>(output);
	      elements_count = output_tensor->bytes;
	      for (size_t i = 0; i < elements_count; ++i) {
	        accumulate(static_cast<float>(output_raw[i]) -
	                   static_cast<float>(expected_data[i]));
	      }
	      break;
	    }
	    case kTfLiteFloat32: {
	      const float* expected = reinterpret_cast<const float*>(expected_data);
	      const float* output_raw = mTfliteInterpreter->typed_tensor<float>(output);
	      elements_count = output_tensor->bytes / sizeof(float);
	      for (size_t i = 0; i < elements_count; ++i) {
	        accumulate(output_raw[i] - expected[i]);
	      }
	      break;
	    }
	    default:
	      FATAL("Output tensor type %d not supported",
	            static_cast<int>(output_tensor->type));
	      return;
	  }
	  // An empty tensor has no error; avoid dividing by zero.
	  result->meanSquareErrors[output_index] =
	      elements_count == 0 ? 0.0f : err_sum / elements_count;
	  result->maxSingleErrors[output_index] = max_error;
	}

	// Resizes the first input tensor to `shape` and re-allocates all tensors.
	// Returns false (after logging) when either step fails.
	bool BenchmarkModel::resizeInputTensors(std::vector<int> shape) {
	  // The benchmark only expects single input tensor, hardcoded as 0.
	  const int input = mTfliteInterpreter->inputs()[0];
	  if (mTfliteInterpreter->ResizeInputTensor(input, shape) != kTfLiteOk) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                        "Failed to resize input tensor!");
	    return false;
	  }
	  if (mTfliteInterpreter->AllocateTensors() != kTfLiteOk) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                        "Failed to allocate tensors!");
	    return false;
	  }
	  return true;
	}

	// Invokes the interpreter once. Succeeds only when TFLite reports kTfLiteOk
	// and, if an NNAPI delegate is present, the delegate reports no NNAPI error.
	bool BenchmarkModel::runInference() {
	  const auto status = mTfliteInterpreter->Invoke();
	  const int nnapi_errno = mTfliteNnapiDelegate
	                              ? mTfliteNnapiDelegate->GetNnApiErrno()
	                              : ANEURALNETWORKS_NO_ERROR;
	  // Plain logical OR (the escaped "\|\|" was not valid C++).
	  if (status != kTfLiteOk || nnapi_errno != ANEURALNETWORKS_NO_ERROR) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                        "Failed to invoke, tflite status: %d, nnapi errno: %d!",
	                        static_cast<int>(status), nnapi_errno);
	    return false;
	  }
	  return true;
	}

	// Resets the interpreter's variable tensors. Logs and returns false when the
	// interpreter reports anything other than kTfLiteOk.
	bool BenchmarkModel::resetStates() {
	  const auto reset_status = mTfliteInterpreter->ResetVariableTensors();
	  if (reset_status == kTfLiteOk) {
	    return true;
	  }
	  __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                      "Failed to reset variable tensors: %d!",
	                      static_cast<int>(reset_status));
	  return false;
	}

	// Runs inference over the given input/output sequences and appends one
	// InferenceResult per executed inference to `results`.
	//
	// inOutData:             sequences to run; they are cycled through when
	//                        seqInferencesMaxCount exceeds their number. An empty
	//                        vector logs a warning and returns true.
	// seqInferencesMaxCount: maximum number of sequences to execute.
	// timeout:               in seconds; when greater than 0.001, the run stops
	//                        (successfully) after the first sequence at which the
	//                        accumulated inference time exceeds it.
	// flags:                 FLAG_IGNORE_GOLDEN_OUTPUT skips the error computation
	//                        against golden outputs; FLAG_DISCARD_INFERENCE_OUTPUT
	//                        skips saving the raw outputs.
	// results:               receives the per-inference results.
	//
	// Returns false if input creation or inference fails, or if the golden output
	// count does not match the model's output count.
	bool BenchmarkModel::benchmark(
	    const std::vector<InferenceInOutSequence>& inOutData,
	    int seqInferencesMaxCount, float timeout, int flags,
	    std::vector<InferenceResult>* results) {
	  if (inOutData.empty()) {
	    __android_log_print(ANDROID_LOG_WARN, LOG_TAG,
	                        "Input/output vector is empty");
	    return true;
	  }

	  float inferenceTotal = 0.0;
	  for (int seqInferenceIndex = 0; seqInferenceIndex < seqInferencesMaxCount;
	       ++seqInferenceIndex) {
	    // NOTE(review): the status of resetStates() is not checked here; confirm
	    // whether a failed reset should abort the run.
	    resetStates();

	    const int inputOutputSequenceIndex = seqInferenceIndex % inOutData.size();
	    const InferenceInOutSequence& seq = inOutData[inputOutputSequenceIndex];
	    const int seqLength = static_cast<int>(seq.size());
	    for (int i = 0; i < seqLength; ++i) {
	      const InferenceInOut& data = seq[i];

	      // For NNAPI systrace usage documentation, see
	      // frameworks/ml/nn/common/include/Tracing.h.
	      kTraceFunc.ATrace_beginSection("[NN_LA_PE]BenchmarkModel::benchmark");
	      kTraceFunc.ATrace_beginSection("[NN_LA_PIO]BenchmarkModel::input");
	      if (data.input) {
	        setInput(data.input, data.input_size);
	      } else {
	        // No pre-built input buffer: let the data object fill the input
	        // tensor in place.
	        const int input = mTfliteInterpreter->inputs()[0];
	        auto* input_tensor = mTfliteInterpreter->tensor(input);
	        if (!data.createInput(
	                reinterpret_cast<uint8_t*>(input_tensor->data.raw),
	                input_tensor->bytes)) {
	          // Close both the input section and the enclosing benchmark section
	          // so the trace stays balanced on this early exit.
	          kTraceFunc.ATrace_endSection();
	          kTraceFunc.ATrace_endSection();
	          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                              "Input creation %d failed", i);
	          return false;
	        }
	      }
	      kTraceFunc.ATrace_endSection();
	      const long long startTime = currentTimeInUsec();
	      const bool success = runInference();
	      kTraceFunc.ATrace_endSection();
	      const long long endTime = currentTimeInUsec();
	      if (!success) {
	        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
	                            i);
	        return false;
	      }

	      // Timestamps are in microseconds; results are reported in seconds.
	      const float inferenceTime =
	          static_cast<float>(endTime - startTime) / 1000000.0f;
	      const size_t outputsCount = mTfliteInterpreter->outputs().size();
	      const int outputsCountInt = static_cast<int>(outputsCount);
	      InferenceResult result{
	          inferenceTime, {}, {}, {}, inputOutputSequenceIndex, i};
	      result.meanSquareErrors.resize(outputsCount);
	      result.maxSingleErrors.resize(outputsCount);
	      result.inferenceOutputs.resize(outputsCount);

	      if ((flags & FLAG_IGNORE_GOLDEN_OUTPUT) == 0) {
	        if (outputsCount != data.outputs.size()) {
	          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                              "Golden/actual outputs (%zu/%zu) count mismatch",
	                              data.outputs.size(), outputsCount);
	          return false;
	        }
	        for (int j = 0; j < outputsCountInt; ++j) {
	          getOutputError(data.outputs[j].ptr, data.outputs[j].size, &result, j);
	        }
	      }

	      if ((flags & FLAG_DISCARD_INFERENCE_OUTPUT) == 0) {
	        for (int j = 0; j < outputsCountInt; ++j) {
	          saveInferenceOutput(&result, j);
	        }
	      }
	      // The result is not used after this point, so move it rather than
	      // copying its vectors.
	      results->push_back(std::move(result));
	      inferenceTotal += inferenceTime;
	    }

	    // Timeout?
	    if (timeout > 0.001 && inferenceTotal > timeout) {
	      return true;
	    }
	  }
	  return true;
	}

	// If cacheDir is not nullptr, compilation caching will be used with NNAPI.
	bool BenchmarkModel::runCompilation(const char* cacheDir) {
	std::unique_ptr<tflite::Interpreter> interpreter;
	tflite::ops::builtin::BuiltinOpResolver resolver;
	tflite::InterpreterBuilder(*mTfliteModel, resolver)(&interpreter);
	if (!interpreter) {
	__android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to create TFlite interpreter");
	return false;
	}

	// Allow Fp16 precision for all models
	interpreter->SetAllowFp16PrecisionForFp32(true);

	if (mUseNnApi) {
	tflite::StatefulNnApiDelegate::Options nnapi_options;
	nnapi_options.accelerator_name = mNnApiDeviceName.empty() ? nullptr : mNnApiDeviceName.c_str();
	if (cacheDir) {
	nnapi_options.cache_dir = cacheDir;
	nnapi_options.model_token = mModelFile.c_str();
	}
	tflite::StatefulNnApiDelegate delegate(nnapi_options);
	int delegationStatus = interpreter->ModifyGraphWithDelegate(&delegate);
	auto nnapiErrno = delegate.GetNnApiErrno();
	if (delegationStatus != kTfLiteOk \|\| nnapiErrno != ANEURALNETWORKS_NO_ERROR) {
	__android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	"Failed to initialize NNAPI Delegate for model %s, nnapi_errno is %d",
	mModelFile.c_str(), nnapiErrno);
	return false;
	}
	}
	return true;
	}

	// A helper class to manage the lifetime of a temporary cache directory.
	//
	// The directory is created under `base` by recreate() and removed, together
	// with everything inside it, when recreate() is called again or when the
	// object is destroyed.
	class ScopedTempDirectory {
	 public:
	  explicit ScopedTempDirectory(std::string base) : mBase(std::move(base)) {}
	  ~ScopedTempDirectory() { cleanup(); }

	  // The object owns the directory on disk; a copy would remove the same
	  // directory a second time.
	  ScopedTempDirectory(const ScopedTempDirectory&) = delete;
	  ScopedTempDirectory& operator=(const ScopedTempDirectory&) = delete;

	  // Create a new temp directory, remove the old one if needed.
	  void recreate() {
	    cleanup();
	    mTempDir = mBase + "/XXXXXX";
	    // mkdtemp() rewrites the trailing XXXXXX in place with a unique suffix.
	    // NOTE(review): its result is not checked; if creation fails, get() still
	    // returns a path that does not exist. Confirm callers fail cleanly then.
	    mkdtemp(&mTempDir[0]);
	  }

	  // Get the path to the temp directory, or nullptr if none has been created.
	  const char* get() const {
	    return mTempDir.empty() ? nullptr : mTempDir.c_str();
	  }

	 private:
	  // Recursively removes the current temp directory, if any.
	  void cleanup() {
	    if (mTempDir.empty()) {
	      return;
	    }
	    // nftw() requires int(const char*, const struct stat*, int, struct FTW*);
	    // a capture-less lambda with that exact signature converts to the
	    // function pointer it expects.
	    auto callback = [](const char* entry, const struct stat*, int,
	                       struct FTW*) { return remove(entry); };
	    // FTW_DEPTH visits a directory's contents before the directory itself,
	    // so each directory is empty by the time it is removed.
	    nftw(mTempDir.c_str(), callback, 128, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
	    mTempDir.clear();
	  }

	  std::string mBase;
	  std::string mTempDir;
	};

	// Compiles the model once into a fresh temporary cache directory and reports
	// the total size, in bytes, of the regular files left in that directory.
	//
	// cacheSizeBytes: receives the total size; must not be nullptr.
	// Returns false if cacheSizeBytes is nullptr, the compilation fails, or the
	// cache directory cannot be opened or inspected.
	bool BenchmarkModel::getCompilationCacheSize(int* cacheSizeBytes) {
	  if (cacheSizeBytes == nullptr) return false;

	  // Create cache files.
	  ScopedTempDirectory tempDir(mCacheDir.value());
	  tempDir.recreate();
	  if (!runCompilation(tempDir.get())) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Save to cache failed");
	    return false;
	  }

	  // Compute total size of cache files.
	  const std::string cachePath = tempDir.get();
	  // The directory handle is closed automatically on every exit path.
	  std::unique_ptr<DIR, decltype(&closedir)> dir(opendir(cachePath.c_str()),
	                                                &closedir);
	  if (dir == nullptr) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                        "Failed to open cache directory");
	    return false;
	  }

	  int totalSize = 0;
	  while (const struct dirent* entry = readdir(dir.get())) {
	    // Built as a std::string so that long paths are never truncated, which a
	    // fixed-size buffer would do silently.
	    const std::string fullPath = cachePath + "/" + entry->d_name;
	    struct stat st;
	    if (stat(fullPath.c_str(), &st) != 0) {
	      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to stat %s",
	                          fullPath.c_str());
	      return false;
	    }
	    // Only accumulate sizes of regular files. This will exclude '.' and '..'.
	    if (S_ISREG(st.st_mode)) {
	      totalSize += static_cast<int>(st.st_size);
	    }
	  }
	  *cacheSizeBytes = totalSize;
	  return true;
	}

	// Times repeated compilations of the model for one caching scenario.
	//
	// type:             which scenario to measure (no cache, saving to the cache,
	//                   or preparing from an already populated cache).
	// maxNumIterations: upper bound on the number of timed compilations.
	// timeout:          in seconds; when greater than 0.001, timing stops
	//                   (successfully) once the accumulated time exceeds it.
	// results:          optional; cleared first, then receives one compilation
	//                   time in seconds per iteration. May be nullptr.
	//
	// Returns false if any compilation fails or `type` is not recognized.
	bool BenchmarkModel::benchmarkSingleTypeOfCompilation(CompilationBenchmarkType type,
	                                                      int maxNumIterations, float timeout,
	                                                      std::vector<float>* results) {
	  const bool collectResults = results != nullptr;
	  if (collectResults) {
	    results->clear();
	  }
	  ScopedTempDirectory tempDir(mCacheDir.value());

	  // Initialize cache files to benchmark cache hit.
	  if (type == CompilationBenchmarkType::PREPARE_FROM_CACHE) {
	    tempDir.recreate();
	    if (!runCompilation(tempDir.get())) {
	      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Save to cache failed");
	      return false;
	    }
	  }

	  float compilationTotal = 0.0;
	  for (int iteration = 0; iteration < maxNumIterations; ++iteration) {
	    const char* cacheDir = nullptr;
	    if (type == CompilationBenchmarkType::SAVE_TO_CACHE) {
	      // Remove the cache files from the last iteration to benchmark cache miss.
	      tempDir.recreate();
	      cacheDir = tempDir.get();
	    } else if (type == CompilationBenchmarkType::PREPARE_FROM_CACHE) {
	      cacheDir = tempDir.get();
	    } else if (type != CompilationBenchmarkType::WITHOUT_CACHE) {
	      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                          "Unknown CompilationBenchmarkType: %d", static_cast<int>(type));
	      return false;
	    }

	    kTraceFunc.ATrace_beginSection("[NN_LA_PC]BenchmarkModel::benchmarkCompilation");
	    const long long beginUsec = currentTimeInUsec();
	    const bool compiled = runCompilation(cacheDir);
	    const long long finishUsec = currentTimeInUsec();
	    kTraceFunc.ATrace_endSection();
	    if (!compiled) {
	      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Compilation %d failed", iteration);
	      return false;
	    }

	    // Timestamps are in microseconds; results are reported in seconds.
	    const float elapsedSec = static_cast<float>(finishUsec - beginUsec) / 1000000.0f;
	    if (collectResults) {
	      results->push_back(elapsedSec);
	    }

	    // Timeout?
	    compilationTotal += elapsedSec;
	    const bool timedOut = timeout > 0.001 && compilationTotal > timeout;
	    if (timedOut) {
	      break;
	    }
	  }
	  return true;
	}

	// Runs one compilation benchmark scenario twice: a warmup pass whose timings
	// are discarded, then a measured pass whose timings go to `results`.
	//
	// Each pass is bounded by maxNumIterations and its own timeout. Returns false
	// as soon as either pass fails; the measured pass is skipped if the warmup
	// fails.
	bool BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup(CompilationBenchmarkType type,
	                                                                int maxNumIterations,
	                                                                float warmupTimeout,
	                                                                float runTimeout,
	                                                                std::vector<float>* results) {
	  // Warmup pass: no results are collected.
	  kTraceFunc.ATrace_beginSection(
	      "[NN_LA_PWM]BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup");
	  const bool warmupOk =
	      benchmarkSingleTypeOfCompilation(type, maxNumIterations, warmupTimeout, nullptr);
	  kTraceFunc.ATrace_endSection();
	  if (!warmupOk) {
	    return false;
	  }

	  // Measured pass.
	  kTraceFunc.ATrace_beginSection(
	      "[NN_LA_PBM]BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup");
	  const bool measuredOk =
	      benchmarkSingleTypeOfCompilation(type, maxNumIterations, runTimeout, results);
	  kTraceFunc.ATrace_endSection();
	  return measuredOk;
	}

	// Benchmarks model compilation and fills `result`.
	//
	// Compilation without a cache is always measured, followed by the size of the
	// compilation cache. Saving to and preparing from the cache are measured only
	// when that size is greater than zero; otherwise the corresponding optional
	// fields of `result` are left untouched.
	//
	// maxNumIterations: upper bound on iterations for each warmup/measured pass.
	// warmupTimeout:    timeout, in seconds, for each warmup pass.
	// runTimeout:       timeout, in seconds, for each measured pass.
	// result:           receives the measurements; must not be nullptr.
	//
	// Returns false if `result` is nullptr or any step fails.
	bool BenchmarkModel::benchmarkCompilation(int maxNumIterations, float warmupTimeout,
	                                          float runTimeout, CompilationBenchmarkResult* result) {
	  if (result == nullptr) return false;

	  // Benchmark compile without cache.
	  bool success = benchmarkSingleTypeOfCompilationWithWarmup(
	      CompilationBenchmarkType::WITHOUT_CACHE, maxNumIterations, warmupTimeout, runTimeout,
	      &result->compileWithoutCacheTimeSec);
	  if (!success) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                        "Failed to benchmark compilation without cache");
	    return false;
	  }

	  // Get compilation cache size.
	  success = getCompilationCacheSize(&result->cacheSizeBytes);
	  if (!success) {
	    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to retrieve compilation cache size");
	    return false;
	  }

	  // Benchmark saving to cache and preparing from cache only if supported.
	  if (result->cacheSizeBytes > 0) {
	    // Benchmark saving to cache.
	    auto& saveToCacheTimeSec = result->saveToCacheTimeSec.emplace();
	    success = benchmarkSingleTypeOfCompilationWithWarmup(
	        CompilationBenchmarkType::SAVE_TO_CACHE, maxNumIterations, warmupTimeout, runTimeout,
	        &saveToCacheTimeSec);
	    if (!success) {
	      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to benchmark saving to cache");
	      return false;
	    }

	    // Benchmark preparing from cache.
	    auto& prepareFromCacheTimeSec = result->prepareFromCacheTimeSec.emplace();
	    success = benchmarkSingleTypeOfCompilationWithWarmup(
	        CompilationBenchmarkType::PREPARE_FROM_CACHE, maxNumIterations, warmupTimeout,
	        runTimeout, &prepareFromCacheTimeSec);
	    if (!success) {
	      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to benchmark preparing from cache");
	      return false;
	    }
	  }
	  // Return the success value explicitly. This used to return `result`, a
	  // non-null pointer implicitly converted to bool.
	  return true;
	}

	// Runs every inference of every sequence in `inOutData` and writes the raw
	// bytes of each tensor listed in `outputs` to its own file under `path`.
	//
	// Files are named dump_<seq>_seq_<inference>_order_<order>_tensor_<index>.
	// Tensors without a data buffer are logged and skipped.
	//
	// Returns false if an inference fails or a dump file cannot be opened.
	bool BenchmarkModel::dumpAllLayers(
	    const char* path, const std::vector<InferenceInOutSequence>& inOutData) {
	  if (inOutData.empty()) {
	    FATAL("Input/output vector is empty");
	  }

	  for (int seqInferenceIndex = 0; seqInferenceIndex < inOutData.size();
	       ++seqInferenceIndex) {
	    resetStates();

	    const InferenceInOutSequence& seq = inOutData[seqInferenceIndex];
	    for (int i = 0; i < seq.size(); ++i) {
	      const InferenceInOut& data = seq[i];
	      setInput(data.input, data.input_size);
	      const bool success = runInference();
	      if (!success) {
	        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
	                            i);
	        return false;
	      }

	      // The order of the tensor is not sorted by the tensor index
	      // NOTE(review): `outputs` is not declared in this function; presumably
	      // it is declared elsewhere in the file or class. Confirm.
	      for (int tensor_order = 0; tensor_order < outputs.size(); ++tensor_order) {
	        int tensor_index = outputs[tensor_order];
	        auto* output_tensor = mTfliteInterpreter->tensor(tensor_index);
	        if (output_tensor->data.raw == nullptr) {
	          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                              "output_tensor->data.raw == nullptr at index %d ", tensor_index);
	          continue;
	        }
	        char fullpath[1024];
	        snprintf(fullpath, 1024, "%s/dump_%.3d_seq_%.3d_order_%.3d_tensor_%.3d", path,
	                 seqInferenceIndex, i, tensor_order, tensor_index);
	        FILE* f = fopen(fullpath, "wb");
	        if (f == nullptr) {
	          // Without this check, fwrite()/fclose() would be called on a null
	          // stream whenever the file cannot be created.
	          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
	                              "Failed to open %s for writing", fullpath);
	          return false;
	        }
	        fwrite(output_tensor->data.raw, output_tensor->bytes, 1, f);
	        fclose(f);
	      }
	    }
	  }
	  return true;
	}