tensorflow/core/tpu/tpu_on_demand_compiler.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <memory>

 #include "absl/cleanup/cleanup.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_group.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/stream_executor/device_memory_allocator.h"
 #include "tensorflow/stream_executor/tpu/c_api_conversions.h"
 #include "tensorflow/stream_executor/tpu/c_api_decl.h"
 #include "tensorflow/stream_executor/tpu/proto_helper.h"
 #include "tensorflow/stream_executor/tpu/status_helper.h"
 #include "tensorflow/stream_executor/tpu/tpu_executable.h"
 #include "tensorflow/stream_executor/tpu/tpu_executor.h"
 #include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h"
 #include "tensorflow/stream_executor/tpu/tpu_platform.h"
 #include "tensorflow/stream_executor/tpu/tpu_platform_id.h"

 namespace xla {

 namespace {

 using ::tensorflow::tpu::ExecutorApiFn;

 class TpuCompiler : public Compiler {
  public:
   TpuCompiler() { compiler_ = ExecutorApiFn()->TpuCompiler_NewFn(); }
   ~TpuCompiler() override { ExecutorApiFn()->TpuCompiler_FreeFn(compiler_); }

   stream_executor::Platform::Id PlatformId() const override {
     return tensorflow::tpu::GetTpuPlatformId();
   }

   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
       stream_executor::StreamExecutor* executor,
       const CompileOptions& options) override {
     XLA_HloModule hlo_module;
     auto cleanup = absl::MakeCleanup([&hlo_module]() {
       stream_executor::tpu::SerializedProto_Free(hlo_module.proto);
       ApiConverter::Free(&hlo_module.module_config);
     });
     hlo_module.module_config = ApiConverter::ToC(module->config());
     hlo_module.proto = stream_executor::tpu::SerializeProto(module->ToProto());
     auto allocator = ApiConverter::ToC(options.device_allocator);
     XLA_HloModule result;
     StatusHelper status;
     ExecutorApiFn()->TpuCompiler_RunHloPassesFn(
         compiler_, &hlo_module,
         static_cast<tensorflow::tpu::TpuExecutor*>(executor->implementation())
             ->se_executor(),
         &allocator, &result, status.c_status);
     if (!status.ok()) {
       return status.status();
     }
     HloModuleProto result_proto =
         stream_executor::tpu::DeserializeProto<HloModuleProto>(result.proto);
     stream_executor::tpu::SerializedProto_Free(result.proto);
     return HloModule::CreateFromProto(result_proto, module->config());
   }

   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
       stream_executor::StreamExecutor* executor,
       const CompileOptions& options) override {
     XLA_HloModule hlo_module;
     auto cleanup = absl::MakeCleanup([&hlo_module]() {
       stream_executor::tpu::SerializedProto_Free(hlo_module.proto);
       ApiConverter::Free(&hlo_module.module_config);
     });
     SE_Executable* result;
     hlo_module.module_config = ApiConverter::ToC(module->config());
     hlo_module.proto = stream_executor::tpu::SerializeProto(module->ToProto());
     auto allocator = ApiConverter::ToC(options.device_allocator);

     StatusHelper status;
     ExecutorApiFn()->TpuCompiler_RunBackendFn(
         compiler_, &hlo_module,
         static_cast<tensorflow::tpu::TpuExecutor*>(executor->implementation())
             ->se_executor(),
         &allocator, &result, status.c_status);
     if (!status.ok()) {
       return status.status();
     }

     std::unique_ptr<Executable> exec =
         absl::make_unique<TpuExecutable>(result, std::move(module));
     return exec;
   }

   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<stream_executor::StreamExecutor*>> stream_exec,
       const CompileOptions& options) override {
     XLA_HloModuleGroup se_module_group;
     se_module_group.proto =
         stream_executor::tpu::SerializeProto(module_group->ToProto());
     se_module_group.module_config =
         new XLA_HloModuleConfig[module_group->size()];
     int module_group_size = module_group->size();
     auto cleanup_config =
         absl::MakeCleanup([&se_module_group, module_group_size]() {
           for (auto i = 0; i < module_group_size; ++i) {
             ApiConverter::Free(&se_module_group.module_config[i]);
           }
           delete[] se_module_group.module_config;
         });
     for (int i = 0; i < module_group->size(); ++i) {
       const auto& config = module_group->module(i).config();
       se_module_group.module_config[i] = ApiConverter::ToC(config);
     }
     std::vector<SE_StreamExecutorList> se_lists(stream_exec.size());
     std::vector<std::vector<SE_StreamExecutor*>> se_lists_storage;
     for (int i = 0; i < stream_exec.size(); ++i) {
       se_lists[i].count = stream_exec[i].size();
       se_lists_storage.emplace_back(stream_exec[i].size());
       se_lists[i].exec = se_lists_storage.back().data();
       for (int j = 0; j < stream_exec[i].size(); ++j) {
         se_lists[i].exec[j] = static_cast<tensorflow::tpu::TpuExecutor*>(
                                   stream_exec[i][j]->implementation())
                                   ->se_executor();
       }
     }

     SE_DeviceMemoryAllocator allocator =
         ApiConverter::ToC(options.device_allocator);

     SE_Executable** se_executables = new SE_Executable*[module_group->size()];

     StatusHelper status;

     ExecutorApiFn()->TpuCompiler_CompileFn(
         compiler_, &se_module_group, se_lists.data(), stream_exec.size(),
         &allocator, se_executables, status.c_status);

     if (!status.ok()) {
       return status.status();
     }

     std::vector<std::unique_ptr<Executable>> executables;
     for (int i = 0; i < module_group->size(); ++i) {
       // We get the HloModule from the compiled executable, rather than reusing
       // the input module from 'module_group', in case the module changed in
       // some way. For example, if the computation is automatically partitioned
       // via XLA, the executable's module may have different input/output shapes
       // than the input module.
       XLA_HloModule c_module =
           ExecutorApiFn()->TpuExecutable_HloModuleFn(se_executables[i]);
       auto cleanup_c_module =
           absl::MakeCleanup([&c_module]() { ApiConverter::Free(&c_module); });
       TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                           ApiConverter::FromC(c_module));
       std::shared_ptr<HloModule> module_shared(module.release());
       executables.emplace_back(absl::make_unique<TpuExecutable>(
           se_executables[i], std::move(module_shared)));
     }

     stream_executor::tpu::SerializedProto_Free(se_module_group.proto);
     delete[] se_executables;

     return executables;
   }

   // Compiles the HLO module group for ahead-of-time execution.  This is
   // intended for use in static compilation.
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                      const AotCompilationOptions& options) override {
     return Unimplemented("This compiler does not support CompileAheadOfTime.");
   }

   // Returns a function that computes the size in bytes of the logical
   // buffer that contains a shape.
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
     return [this](const xla::Shape& shape) {
       XLA_Shape c_shape;
       ApiConverter::ToC(shape, &c_shape);
       int64_t bytes =
           ExecutorApiFn()->TpuCompiler_ShapeSizeFn(compiler_, &c_shape);
       ApiConverter::Free(&c_shape);
       return bytes;
     };
   }

  private:
   Tpu_Compiler* compiler_;
 };

 static bool InitModule() {
   xla::Compiler::RegisterCompilerFactory(
       tensorflow::tpu::GetTpuPlatformId(),
       []() { return absl::make_unique<TpuCompiler>(); });
   return true;
 }

 static bool module_initialized = InitModule();

 }  // namespace
 }  // namespace xla
	/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/
	#include <memory>

	#include "absl/cleanup/cleanup.h"
	#include "absl/types/span.h"
	#include "tensorflow/compiler/xla/service/compiler.h"
	#include "tensorflow/compiler/xla/service/executable.h"
	#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
	#include "tensorflow/compiler/xla/service/hlo_module.h"
	#include "tensorflow/compiler/xla/service/hlo_module_group.h"
	#include "tensorflow/compiler/xla/service/shaped_buffer.h"
	#include "tensorflow/compiler/xla/shape_util.h"
	#include "tensorflow/compiler/xla/util.h"
	#include "tensorflow/compiler/xla/xla_data.pb.h"
	#include "tensorflow/stream_executor/device_memory_allocator.h"
	#include "tensorflow/stream_executor/tpu/c_api_conversions.h"
	#include "tensorflow/stream_executor/tpu/c_api_decl.h"
	#include "tensorflow/stream_executor/tpu/proto_helper.h"
	#include "tensorflow/stream_executor/tpu/status_helper.h"
	#include "tensorflow/stream_executor/tpu/tpu_executable.h"
	#include "tensorflow/stream_executor/tpu/tpu_executor.h"
	#include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h"
	#include "tensorflow/stream_executor/tpu/tpu_platform.h"
	#include "tensorflow/stream_executor/tpu/tpu_platform_id.h"

	namespace xla {

	namespace {

	using ::tensorflow::tpu::ExecutorApiFn;

	class TpuCompiler : public Compiler {
	public:
	TpuCompiler() { compiler_ = ExecutorApiFn()->TpuCompiler_NewFn(); }
	~TpuCompiler() override { ExecutorApiFn()->TpuCompiler_FreeFn(compiler_); }

	stream_executor::Platform::Id PlatformId() const override {
	return tensorflow::tpu::GetTpuPlatformId();
	}

	StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
	std::unique_ptr<HloModule> module,
	stream_executor::StreamExecutor* executor,
	const CompileOptions& options) override {
	XLA_HloModule hlo_module;
	auto cleanup = absl::MakeCleanup([&hlo_module]() {
	stream_executor::tpu::SerializedProto_Free(hlo_module.proto);
	ApiConverter::Free(&hlo_module.module_config);
	});
	hlo_module.module_config = ApiConverter::ToC(module->config());
	hlo_module.proto = stream_executor::tpu::SerializeProto(module->ToProto());
	auto allocator = ApiConverter::ToC(options.device_allocator);
	XLA_HloModule result;
	StatusHelper status;
	ExecutorApiFn()->TpuCompiler_RunHloPassesFn(
	compiler_, &hlo_module,
	static_cast<tensorflow::tpu::TpuExecutor*>(executor->implementation())
	->se_executor(),
	&allocator, &result, status.c_status);
	if (!status.ok()) {
	return status.status();
	}
	HloModuleProto result_proto =
	stream_executor::tpu::DeserializeProto<HloModuleProto>(result.proto);
	stream_executor::tpu::SerializedProto_Free(result.proto);
	return HloModule::CreateFromProto(result_proto, module->config());
	}

	StatusOr<std::unique_ptr<Executable>> RunBackend(
	std::unique_ptr<HloModule> module,
	stream_executor::StreamExecutor* executor,
	const CompileOptions& options) override {
	XLA_HloModule hlo_module;
	auto cleanup = absl::MakeCleanup([&hlo_module]() {
	stream_executor::tpu::SerializedProto_Free(hlo_module.proto);
	ApiConverter::Free(&hlo_module.module_config);
	});
	SE_Executable* result;
	hlo_module.module_config = ApiConverter::ToC(module->config());
	hlo_module.proto = stream_executor::tpu::SerializeProto(module->ToProto());
	auto allocator = ApiConverter::ToC(options.device_allocator);

	StatusHelper status;
	ExecutorApiFn()->TpuCompiler_RunBackendFn(
	compiler_, &hlo_module,
	static_cast<tensorflow::tpu::TpuExecutor*>(executor->implementation())
	->se_executor(),
	&allocator, &result, status.c_status);
	if (!status.ok()) {
	return status.status();
	}

	std::unique_ptr<Executable> exec =
	absl::make_unique<TpuExecutable>(result, std::move(module));
	return exec;
	}

	StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
	std::unique_ptr<HloModuleGroup> module_group,
	std::vector<std::vector<stream_executor::StreamExecutor*>> stream_exec,
	const CompileOptions& options) override {
	XLA_HloModuleGroup se_module_group;
	se_module_group.proto =
	stream_executor::tpu::SerializeProto(module_group->ToProto());
	se_module_group.module_config =
	new XLA_HloModuleConfig[module_group->size()];
	int module_group_size = module_group->size();
	auto cleanup_config =
	absl::MakeCleanup([&se_module_group, module_group_size]() {
	for (auto i = 0; i < module_group_size; ++i) {
	ApiConverter::Free(&se_module_group.module_config[i]);
	}
	delete[] se_module_group.module_config;
	});
	for (int i = 0; i < module_group->size(); ++i) {
	const auto& config = module_group->module(i).config();
	se_module_group.module_config[i] = ApiConverter::ToC(config);
	}
	std::vector<SE_StreamExecutorList> se_lists(stream_exec.size());
	std::vector<std::vector<SE_StreamExecutor*>> se_lists_storage;
	for (int i = 0; i < stream_exec.size(); ++i) {
	se_lists[i].count = stream_exec[i].size();
	se_lists_storage.emplace_back(stream_exec[i].size());
	se_lists[i].exec = se_lists_storage.back().data();
	for (int j = 0; j < stream_exec[i].size(); ++j) {
	se_lists[i].exec[j] = static_cast<tensorflow::tpu::TpuExecutor*>(
	stream_exec[i][j]->implementation())
	->se_executor();
	}
	}

	SE_DeviceMemoryAllocator allocator =
	ApiConverter::ToC(options.device_allocator);

	SE_Executable** se_executables = new SE_Executable*[module_group->size()];

	StatusHelper status;

	ExecutorApiFn()->TpuCompiler_CompileFn(
	compiler_, &se_module_group, se_lists.data(), stream_exec.size(),
	&allocator, se_executables, status.c_status);

	if (!status.ok()) {
	return status.status();
	}

	std::vector<std::unique_ptr<Executable>> executables;
	for (int i = 0; i < module_group->size(); ++i) {
	// We get the HloModule from the compiled executable, rather than reusing
	// the input module from 'module_group', in case the module changed in
	// some way. For example, if the computation is automatically partitioned
	// via XLA, the executable's module may have different input/output shapes
	// than the input module.
	XLA_HloModule c_module =
	ExecutorApiFn()->TpuExecutable_HloModuleFn(se_executables[i]);
	auto cleanup_c_module =
	absl::MakeCleanup([&c_module]() { ApiConverter::Free(&c_module); });
	TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
	ApiConverter::FromC(c_module));
	std::shared_ptr<HloModule> module_shared(module.release());
	executables.emplace_back(absl::make_unique<TpuExecutable>(
	se_executables[i], std::move(module_shared)));
	}

	stream_executor::tpu::SerializedProto_Free(se_module_group.proto);
	delete[] se_executables;

	return executables;
	}

	// Compiles the HLO module group for ahead-of-time execution. This is
	// intended for use in static compilation.
	StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
	CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
	const AotCompilationOptions& options) override {
	return Unimplemented("This compiler does not support CompileAheadOfTime.");
	}

	// Returns a function that computes the size in bytes of the logical
	// buffer that contains a shape.
	HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
	return [this](const xla::Shape& shape) {
	XLA_Shape c_shape;
	ApiConverter::ToC(shape, &c_shape);
	int64_t bytes =
	ExecutorApiFn()->TpuCompiler_ShapeSizeFn(compiler_, &c_shape);
	ApiConverter::Free(&c_shape);
	return bytes;
	};
	}

	private:
	Tpu_Compiler* compiler_;
	};

	static bool InitModule() {
	xla::Compiler::RegisterCompilerFactory(
	tensorflow::tpu::GetTpuPlatformId(),
	[]() { return absl::make_unique<TpuCompiler>(); });
	return true;
	}

	static bool module_initialized = InitModule();

	} // namespace
	} // namespace xla