/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"

#include <algorithm>
#include <cstdint>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
| #include "absl/cleanup/cleanup.h" |
| #include "absl/container/flat_hash_map.h" |
| #include "absl/memory/memory.h" |
| #include "absl/synchronization/mutex.h" |
| #include "tensorflow/compiler/xla/map_util.h" |
| #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" |
| #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" |
| #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" |
| #include "tensorflow/compiler/xla/service/gpu/gpu_types.h" |
| #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" |
| #include "tensorflow/compiler/xla/service/hlo_instruction.h" |
| #include "tensorflow/compiler/xla/service/hlo_parser.h" |
| #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h" |
| #include "tensorflow/compiler/xla/service/logical_buffer.h" |
| #include "tensorflow/compiler/xla/service/shaped_buffer.h" |
| #include "tensorflow/compiler/xla/service/transfer_manager.h" |
| #include "tensorflow/compiler/xla/service/xla_debug_info_manager.h" |
| #include "tensorflow/compiler/xla/shape_tree.h" |
| #include "tensorflow/compiler/xla/shape_util.h" |
| #include "tensorflow/compiler/xla/status_macros.h" |
| #include "tensorflow/compiler/xla/util.h" |
| #include "tensorflow/core/lib/gtl/map_util.h" |
| #include "tensorflow/core/platform/casts.h" |
| #include "tensorflow/core/platform/errors.h" |
| #include "tensorflow/core/platform/logging.h" |
| #include "tensorflow/core/profiler/lib/scoped_annotation.h" |
| #include "tensorflow/core/profiler/lib/traceme.h" |
| #include "tensorflow/stream_executor/platform.h" |
| |
#if XLA_ENABLE_XLIR
#include "llvm/Support/SourceMgr.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
#include "mlir/IR/Builders.h"  // from @llvm-project
#include "mlir/IR/Diagnostics.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/utils/name_utils.h"
#include "tensorflow/compiler/xla/service/gpu/jitrt_custom_calls.h"
#include "tensorflow/compiler/xla/service/gpu/xlir_ops.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
| #include "tfrt/gpu/gpu_executor.h" // from @tf_runtime |
| #include "tfrt/gpu/gpu_types.h" // from @tf_runtime |
| #include "tfrt/jitrt/jitrt.h" // from @tf_runtime |
| #include "tfrt/jitrt/jitrt_compiler.h" // from @tf_runtime |
| #include "tfrt/bef/bef_buffer.h" // from @tf_runtime |
| #include "tfrt/bef_converter/bef_to_mlir.h" // from @tf_runtime |
| #include "tfrt/bef_executor/bef_file.h" // from @tf_runtime |
| #include "tfrt/core_runtime/core_runtime.h" // from @tf_runtime |
| #include "tfrt/host_context/async_dispatch.h" // from @tf_runtime |
| #include "tfrt/host_context/chain.h" // from @tf_runtime |
| #include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime |
| #include "tfrt/host_context/execution_context.h" // from @tf_runtime |
| #include "tfrt/host_context/function.h" // from @tf_runtime |
| #include "tfrt/host_context/host_allocator.h" // from @tf_runtime |
| #include "tfrt/host_context/host_context.h" // from @tf_runtime |
| #include "tfrt/init_tfrt_dialects.h" // from @tf_runtime |
| #endif // XLA_ENABLE_XLIR |

namespace xla {
namespace gpu {

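// Predicates that gate the TFRT-based execution paths. Each one CHECK-fails
// if the corresponding DebugOptions flag is set in a build without
// XLA_ENABLE_XLIR, since the request cannot be honored.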
bool IsBefExecutableEnabled(const HloModuleConfig& config) {
#if !XLA_ENABLE_XLIR
  CHECK(!config.debug_options().xla_gpu_bef_executable())
      << "Failed to enable BEF backend because it was not compiled in.";
#endif  // !XLA_ENABLE_XLIR
  return config.debug_options().xla_gpu_bef_executable();
}

bool IsBefThunkEnabled(const HloModuleConfig& config) {
#if !XLA_ENABLE_XLIR
  CHECK(!config.debug_options().xla_gpu_bef_thunk())
      << "Failed to enable BEF backend because it was not compiled in.";
#endif  // !XLA_ENABLE_XLIR
  return config.debug_options().xla_gpu_bef_thunk();
}

bool IsJitRtExecutableEnabled(const HloModuleConfig& config) {
#if !XLA_ENABLE_XLIR
  CHECK(!config.debug_options().xla_gpu_jitrt_executable())
      << "Failed to enable JitRt backend because it was not compiled in.";
#endif  // !XLA_ENABLE_XLIR
  return config.debug_options().xla_gpu_jitrt_executable();
}

namespace {

using ::tensorflow::profiler::ScopedAnnotation;

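// Returns true if the thunk must be paired with a dedicated asynchronous
// communications stream (currently only NCCL all-reduce start/done thunks).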
bool NeedsAsyncCommsStream(Thunk& thunk) {
  switch (thunk.kind()) {
    case Thunk::Kind::kNcclAllReduceStart:
    case Thunk::Kind::kNcclAllReduceDone:
      return true;
    default:
      return false;
  }
}

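// Returns the module name qualified with the HloModule's unique id (when
// available), so that two modules with the same name do not collide when
// registered with the XlaDebugInfoManager.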
static std::string ModuleUniqueName(absl::string_view module_name,
                                    const HloModule* module) {
  std::string unique_id;
  if (module != nullptr) {
    unique_id = absl::StrCat("module.", module->unique_id(), ".");
  }
  return absl::StrCat(unique_id, module_name);
}

}  // namespace

void GpuExecutable::BefBufferDeleter::operator()(uint8_t* ptr) const {
#if XLA_ENABLE_XLIR
  tfrt::AlignedFree(ptr);
#else
  LOG(FATAL) << "OwnedBefBuffer only supported with XLA_ENABLE_XLIR";
#endif
}

void GpuExecutable::GpuContextCacheDeleter::operator()(
    tfrt::gpu::GpuContextCache* ptr) const {
#if XLA_ENABLE_XLIR
  delete ptr;
#else
  LOG(FATAL) << "OwnedGpuContextCache only supported with XLA_ENABLE_XLIR";
#endif
}

#if XLA_ENABLE_XLIR
struct GpuExecutable::BefExecutable {
 private:
  explicit BefExecutable(OwnedBefBuffer buffer,
                         OwnedGpuContextCache context_cache)
      : bef_buffer(std::move(buffer)),
        host_ctx(tfrt::gpu::CreateHostContext(
            tfrt::gpu::GetDiagHandler(&mlir_ctx))) {
    if (context_cache) {
      gpu_ctx_cache = std::move(context_cache);
    } else {
      gpu_ctx_cache = OwnedGpuContextCache(new tfrt::gpu::GpuContextCache);
    }
  }

  Status Initialize() {
    bef_file =
        tfrt::BEFFile::Open({bef_buffer.get(), bef_buffer.get_deleter().size},
                            host_ctx->GetKernelRegistry(),
                            host_ctx->diag_handler(), host_ctx->allocator());
    if (!bef_file) {
      return InternalError("Failed to decode BEF buffer");
    }

    auto req_ctx = tfrt::RequestContextBuilder(host_ctx.get(), nullptr).build();
    if (!req_ctx) {
      return tensorflow::errors::Internal(toString(req_ctx.takeError()));
    }
    tfrt::ExecutionContext exec_ctx(*req_ctx);

    auto expected_entry_point = tfrt::gpu::GetEntryPoint(*bef_file, exec_ctx);
    if (!expected_entry_point) {
      return tensorflow::errors::Internal(
          toString(expected_entry_point.takeError()));
    }
    entry_point = *expected_entry_point;

    const auto& func_name = entry_point.function_name;
    function = bef_file->GetFunction(func_name);
    if (!function) {
      return InternalError("Failed to get '%s' function", func_name);
    }

    return Status::OK();
  }

 public:
  static StatusOr<BefExecutable*> Create(OwnedBefBuffer buffer,
                                         OwnedGpuContextCache context_cache) {
    std::unique_ptr<BefExecutable> result(
        new BefExecutable(std::move(buffer), std::move(context_cache)));
    TF_RETURN_IF_ERROR(result->Initialize());
    return result.release();
  }

  OwnedBefBuffer bef_buffer;
  mlir::MLIRContext mlir_ctx;
  std::unique_ptr<tfrt::HostContext> host_ctx;
  tfrt::RCReference<tfrt::BEFFile> bef_file;
  tfrt::gpu::EntryPoint entry_point;
  // Signature: (chain, stream, inputs..., outputs...) -> (chain).
  const tfrt::Function* function;
  OwnedGpuContextCache gpu_ctx_cache;
};

namespace jitrt = ::tfrt::jitrt;

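// Owns a JitRt executable compiled from an MLIR module (the "JitRt program"),
// plus the per-executable caches (kernels, gemm configs) that the runtime
// custom call handlers consult during execution.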
class GpuExecutable::JitRtExecutable {
 public:
  static StatusOr<JitRtExecutable*> Create(OwnedJitRtProgram program) {
    // Options for the default JitRt compilation pipeline.
    jitrt::CompilationPipelineOptions copts;
    // We do not expect any parallel loops on the GPU program, so we disable
    // all concurrency (async parallel for loops).
    copts.num_worker_threads = 1;

    // Options for constructing JitRt JitExecutable.
    jitrt::CompilationOptions opts;
    opts.specialization = jitrt::CompilationOptions::Specialization::kDisabled;
    opts.register_dialects = jitrt::RegisterDefaultJitRtDialects;

    // Register JitRt Gpu runtime custom calls with the linker.
    opts.runtime_symbol_map = JitRtCustomCallsSymbolMap;

    // We just use the default compilation pipeline provided by JitRt.
    // Alternatively, instead of having a separate JitRtProgram (LMHLO lowered
    // to JitRt dialects), we could assemble a pipeline that compiles starting
    // from the LMHLO dialect. However, this intermediate step helps with
    // debugging, by materializing IR with XLA runtime custom calls.
    opts.create_compilation_pipeline = [copts](mlir::PassManager& pm) {
      jitrt::CreateDefaultJitRtCompilationPipeline(pm, copts);
    };

    // Instantiate new JitExecutable from the MLIR source.
    auto jit_executable = jitrt::JitExecutable::Instantiate(
        program->module, program->entry_point, opts);
    if (auto err = jit_executable.takeError())
      return InternalError("Failed to compile JitRt program: %s",
                           tfrt::StrCat(err));

    // Pass ownership to the GpuExecutable.
    return new JitRtExecutable(std::move(program->buffer_sizes),
                               std::move(*jit_executable),
                               std::move(program->debug_options));
  }

  jitrt::JitExecutable& jit_executable() { return jit_executable_; }
  jitrt::Executable& default_executable() { return *default_executable_; }
  JitRtKernelsCache& kernels_cache() { return kernels_cache_; }
  JitRtGemmConfigCache& gemm_configs_cache() { return gemm_configs_cache_; }

  // We pass a pointer to the buffer size to the compiled function, so we
  // return a reference to a stable memory location.
  const int64_t& buffer_size(size_t offset) const {
    return buffer_sizes_[offset];
  }

  const DebugOptions& debug_options() const { return debug_options_; }

 private:
  explicit JitRtExecutable(std::vector<int64_t> buffer_sizes,
                           jitrt::JitExecutable jit_executable,
                           DebugOptions debug_options)
      : buffer_sizes_(std::move(buffer_sizes)),
        jit_executable_(std::move(jit_executable)),
        default_executable_(&jit_executable_.DefaultExecutable().get()),
        debug_options_(std::move(debug_options)) {}

  std::vector<int64_t> buffer_sizes_;
  jitrt::JitExecutable jit_executable_;
  jitrt::Executable* default_executable_;  // owned by `jit_executable_`
  DebugOptions debug_options_;

  // Keep a cache of kernels instantiated by this executable.
  JitRtKernelsCache kernels_cache_;

  // Keep a cache of gemm configs for all gemm operations in the program.
  JitRtGemmConfigCache gemm_configs_cache_;
};
#endif  // XLA_ENABLE_XLIR

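// Example (sketch; assumes `params` was populated by the GPU compiler with
// exactly one of the thunk/BEF/JitRt variants in `params.executable`):
//
//   GpuExecutable::Params params = ...;
//   TF_ASSIGN_OR_RETURN(std::unique_ptr<GpuExecutable> gpu_executable,
//                       GpuExecutable::Create(std::move(params)));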
StatusOr<std::unique_ptr<GpuExecutable>> GpuExecutable::Create(Params params) {
  auto executable = std::move(params.executable);
  auto gpu_ctx_cache = std::move(params.gpu_ctx_cache);
  std::unique_ptr<GpuExecutable> result(new GpuExecutable(std::move(params)));

  if (absl::holds_alternative<OwnedThunkSchedule>(executable)) {
    result->thunks_ = std::move(absl::get<OwnedThunkSchedule>(executable));
    return result;
  }

#if XLA_ENABLE_XLIR
  if (absl::holds_alternative<OwnedBefBuffer>(executable)) {
    auto& bef_buffer = absl::get<OwnedBefBuffer>(executable);
    TF_ASSIGN_OR_RETURN(
        result->bef_executable_,
        BefExecutable::Create(std::move(bef_buffer), std::move(gpu_ctx_cache)));
    return result;
  }

  if (absl::holds_alternative<OwnedJitRtProgram>(executable)) {
    auto& program = absl::get<OwnedJitRtProgram>(executable);
    TF_ASSIGN_OR_RETURN(result->jitrt_executable_,
                        JitRtExecutable::Create(std::move(program)));
    return result;
  }
#endif  // XLA_ENABLE_XLIR

  return InternalError("No XLA gpu executable was provided");
}

// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks.
GpuExecutable::GpuExecutable(GpuExecutable::Params params)
    : Executable(std::move(params.debug_module)),
      text_(std::move(params.asm_text)),
      binary_(std::move(params.binary)),
      gpu_version_(params.gpu_version),
      entry_func_attrs_(params.entry_func_attrs),
      module_name_(params.module_name),
      output_shape_(params.output_shape),
      allocations_(std::move(params.allocations)),
      debug_buffer_assignment_(std::move(params.debug_buffer_assignment)),
      verbose_buffer_assignment_string_dumper_(
          params.verbose_buffer_assignment_string_dumper),
      constants_(std::move(params.constants)),
      output_info_(std::move(params.output_info)) {
  XlaDebugInfoManager::Get()->RegisterModule(
      ModuleUniqueName(module_name_, shared_module().get()), shared_module(),
      debug_buffer_assignment_);
}

GpuExecutable::GpuExecutable(
    std::shared_ptr<HloModule> hlo_module, GpuVersion gpu_version,
    xla::EntryFunctionAttributes entry_func_attrs,
    absl::string_view module_name, Shape xla_output_shape,
    std::vector<BufferAllocation> allocations,
    absl::flat_hash_map<ShapeIndex, OutputInfo> output_info,
    BefExecutable* bef_executable)
    : Executable(std::move(hlo_module)),
      gpu_version_(gpu_version),
      entry_func_attrs_(entry_func_attrs),
      module_name_(module_name),
      output_shape_(xla_output_shape),
      allocations_(std::move(allocations)),
      output_info_(std::move(output_info)),
      bef_executable_(bef_executable) {
  XlaDebugInfoManager::Get()->RegisterModule(
      ModuleUniqueName(module_name_, shared_module().get()), shared_module(),
      debug_buffer_assignment_);
}

GpuExecutable::~GpuExecutable() {
  XlaDebugInfoManager::Get()->UnregisterModule(
      ModuleUniqueName(module_name_, shared_module().get()), shared_module(),
      debug_buffer_assignment_);

  {
    // We could have issued host->device mem copies in ResolveConstantGlobals.
    // Wait for those to finish so that we can safely deallocate the backing
    // HLO module.
    //
    // We need the host->device memcpies to finish because they are
    // concurrently reading memory (xla::Literal's) owned by the HLO module.
    absl::MutexLock lock(&module_handle_mutex_);
    for (const auto& pair : module_globals_) {
      CHECK(pair.first->SynchronizeAllActivity());
    }
  }

#if XLA_ENABLE_XLIR
  delete bef_executable_;
  delete jitrt_executable_;
#endif
}

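// Checks that the GPU the stream runs on matches the GPU the executable was
// compiled for: the GCN ISA version on ROCm, or the compute capability on
// CUDA.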
Status GpuExecutable::CheckCompatibilityWithServiceExecutableRunOptions(
    const ServiceExecutableRunOptions* run_options) {
  se::Stream* main_stream = run_options->stream();

  stream_executor::PlatformKind platform_kind =
      main_stream->parent()->platform_kind();
  if (platform_kind == stream_executor::PlatformKind::kROCm) {
    auto cc = main_stream->GetRocmComputeCapability();
    std::string stream_arch = cc.gcn_arch_name();
    std::string gpu_exec_arch =
        absl::get<se::RocmComputeCapability>(gpu_version_).gcn_arch_name();
    TF_RET_CHECK(stream_arch == gpu_exec_arch)
        << "AMDGPU GCN ISA version mismatch; expected {" << gpu_exec_arch
        << "}, but was {" << stream_arch << "}";
  } else if (platform_kind == stream_executor::PlatformKind::kCuda) {
    GpuVersion cc = main_stream->GetCudaComputeCapability();
    TF_RET_CHECK(absl::get<se::CudaComputeCapability>(cc) ==
                 absl::get<se::CudaComputeCapability>(gpu_version_))
        << "Compute capability mismatch; expected {"
        << absl::get<se::CudaComputeCapability>(gpu_version_).ToString()
        << "}, but was {" << absl::get<se::CudaComputeCapability>(cc).ToString()
        << "}";
  } else {
    return InternalError("Unknown platform: %d", platform_kind);
  }

  return Status::OK();
}

namespace {

Status MaybeSyncAndProfile(const ServiceExecutableRunOptions* run_options,
                           uint64_t start_micros, se::Stream* stream_to_sync);

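// Runs the thunks of `thunk_schedule` in schedule order on the main stream
// and any required substreams, using events to honor inter-thunk dependencies
// across streams.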
Status ExecuteThunks(const std::string& module_name,
                     const ThunkSchedule& thunk_schedule,
                     const ServiceExecutableRunOptions* run_options,
                     const BufferAllocations& buffer_allocations,
                     bool block_host_until_done) {
  se::Stream* main_stream = run_options->stream();
  se::StreamExecutor* executor = main_stream->parent();

  StatusOr<StreamPool::Ptr> async_comms_stream =
      run_options->BorrowStream(executor->device_ordinal());

  // Stream 0 indicates `main_stream` and substreams start from stream 1.
  std::vector<StreamPool::Ptr> sub_streams;
  sub_streams.reserve(thunk_schedule.StreamCount() - 1);
  while (sub_streams.size() + 1 < thunk_schedule.StreamCount()) {
    sub_streams.emplace_back();
    TF_ASSIGN_OR_RETURN(sub_streams.back(),
                        run_options->BorrowStream(executor->device_ordinal()));
    // Require substreams to wait for the main stream, otherwise substreams may
    // execute before the program is scheduled to start on the main stream.
    sub_streams.back()->ThenWaitFor(main_stream);
  }

  uint64_t start_micros = tensorflow::Env::Default()->NowMicros();

  tensorflow::profiler::TraceMe hlo_module_activity(
      [&] { return absl::StrCat(module_name, ":XLA GPU module"); },
      tensorflow::profiler::TraceMeLevel::kInfo);

  absl::flat_hash_map<const Thunk*, std::unique_ptr<se::Event>>
      thunk_to_finish_event;
  for (const std::unique_ptr<Thunk>& thunk : thunk_schedule.TotalOrder()) {
    // Annotate execution of this op if tracing was enabled when we started
    // running this module. If tracing is enabled *while* we're running the
    // module, we won't get any data, but that's probably an OK trade-off.
    ScopedAnnotation annotation([&] { return thunk->profile_annotation(); });

    int32_t stream_no = thunk_schedule.StreamNumberForThunk(thunk.get());
    se::Stream* stream =
        (stream_no == 0 ? main_stream : sub_streams[stream_no - 1].get());

    for (const Thunk* dependency : thunk_schedule.DependsOn(thunk.get())) {
      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
    }

    VLOG(2) << "Executing the thunk for " << thunk->profile_annotation()
            << " on stream " << stream_no;

    TF_RET_CHECK(async_comms_stream.ok() || !NeedsAsyncCommsStream(*thunk))
        << "`run_options` must have a stream borrower for async thunks.";

    Thunk::ExecuteParams thunk_params{
        *run_options, buffer_allocations, stream,
        async_comms_stream.ok() ? async_comms_stream->get() : nullptr};
    TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(thunk_params));
    if (thunk_schedule.Depended(thunk.get())) {
      auto finish_event = absl::make_unique<se::Event>(main_stream->parent());
      finish_event->Init();
      stream->ThenRecordEvent(finish_event.get());
      thunk_to_finish_event[thunk.get()] = std::move(finish_event);
    }
  }

  main_stream->ThenWaitFor(&sub_streams);
  return MaybeSyncAndProfile(run_options, start_micros,
                             block_host_until_done ? main_stream : nullptr);
}

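// Optionally blocks the host until `stream_to_sync` has drained, then records
// the wall-clock compute time into the ExecutionProfile if one was requested.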
Status MaybeSyncAndProfile(const ServiceExecutableRunOptions* run_options,
                           uint64_t start_micros,
                           se::Stream* stream_to_sync = nullptr) {
  // Make sure kernels are completed before deallocating temporary buffers or
  // the profiler state.
  // TODO(b/30100571): we could potentially postpone deallocating the temp
  // buffers until a different computation is executed.
  if (stream_to_sync) {
    Status block_status = stream_to_sync->BlockHostUntilDone();
    if (!block_status.ok()) {
      return InternalError(
          "Failed to complete all kernels launched on stream %p: %s",
          stream_to_sync, block_status.error_message());
    }
  }

  // FinishExecution() blocks until main_stream has completed if profiling is
  // enabled; we therefore do not need to defer profile collection onto a
  // stream.
  uint64_t end_micros = tensorflow::Env::Default()->NowMicros();

  if (run_options->run_options().execution_profile()) {
    ExecutionProfile* profile = run_options->run_options().execution_profile();
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    profile->set_compute_time_ns(std::max(nanoseconds, 1.0));
  }

  return Status::OK();
}

}  // namespace

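// Loads the executable's PTX/cubin module onto `stream`'s executor (once per
// executor) and resolves the device addresses of all XLA constants, either
// from symbols defined in the loaded module or by allocating and initializing
// fresh globals.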
StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*>
GpuExecutable::ResolveConstantGlobals(se::Stream* stream) {
  se::StreamExecutor* executor = stream->parent();

  absl::MutexLock lock(&module_handle_mutex_);
  auto it = module_globals_.find(executor);
  if (it != module_globals_.end()) {
    return &it->second;
  }

  se::MultiModuleLoaderSpec module_spec;
  if (!binary().empty()) {
    module_spec.AddCudaCubinInMemory(binary());
  }
  module_spec.AddCudaPtxInMemory(text().c_str());

  absl::flat_hash_map<int64_t, se::DeviceMemoryBase> globals;
  se::ModuleHandle module_handle;
  // The CUDA driver isn't able to load empty PTX. It's okay if we skip loading
  // in this case; if the module isn't loaded, all symbol lookups will fail,
  // just as they should for an empty module.
  if (!(executor->platform_kind() == se::PlatformKind::kCuda &&
        module_spec.cuda_ptx_in_memory() == nullptr)) {
    TF_RETURN_IF_ERROR(executor->LoadModule(module_spec, &module_handle));
  }

  for (const ConstantInfo& info : constants_) {
    StatusOr<stream_executor::DeviceMemoryBase> global_status;
    if (static_cast<bool>(module_handle)) {
      global_status =
          executor->GetUntypedSymbol(info.symbol_name, module_handle);
    }

    se::DeviceMemoryBase global;
    if (static_cast<bool>(module_handle) && global_status.ok()) {
      // The constant was defined in the PTX and has been allocated by the CUDA
      // driver.
      global = *global_status;
      VLOG(3) << "Resolved global " << info.symbol_name << " to "
              << global.opaque();

      if (!info.content.empty()) {
        // This means the constant did not have an initializer in the PTX and
        // therefore must be initialized by XLA here.
        stream->ThenMemcpy(&global, info.content.data(), info.content.size());
      }
    } else {
      // The constant was not defined in the PTX and therefore must be both
      // allocated and initialized by XLA here.
      CHECK(!info.content.empty());

      TF_ASSIGN_OR_RETURN(
          auto shared, executor->CreateOrShareConstant(stream, info.content));
      global = *shared;
      VLOG(3) << "Allocated (or shared) global " << info.symbol_name << " at "
              << global.opaque();
      // XLA will continue to own this global at least until this executable is
      // destroyed (longer if another, longer-lived executable shares the same
      // constant).
      shared_constants_.push_back(std::move(shared));
    }

    if (info.allocation_index != -1) {
      InsertOrDie(&globals, info.allocation_index, global);
    }
  }

  module_handles_.emplace(executor,
                          se::ScopedModuleHandle(executor, module_handle));
  return &module_globals_.emplace(executor, std::move(globals)).first->second;
}

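// Returns the device memory that backs `allocation` for this execution: an
// empty base for thread-local allocations, the caller-provided buffer for
// entry parameters, the resolved global for constants, or a freshly allocated
// buffer for temp/output allocations.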
StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
    VariantArguments arguments,
    const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
    const BufferAllocation& allocation,
    se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,
    int64_t arg_idx) {
  if (allocation.is_thread_local()) {
    return se::DeviceMemoryBase{};
  } else if (allocation.is_entry_computation_parameter()) {
    int64_t param_no = allocation.parameter_number();
    se::DeviceMemoryBase registered_buffer = [&] {
      if (auto unowned_shapedbuffers =
              absl::get_if<absl::Span<const ShapedBuffer* const>>(&arguments)) {
        return (*unowned_shapedbuffers)[param_no]->buffers().element(
            allocation.param_shape_index());
      } else {
        return absl::get<absl::Span<ExecutionInput>>(arguments)[param_no]
            .Buffer(allocation.param_shape_index())
            .AsDeviceMemoryBase();
      }
    }();
    if (registered_buffer.is_null() && registered_buffer.size() > 0) {
      return FailedPrecondition(
          "Cannot run XLA computation because pointer to (sub-)buffer at "
          "index %s of parameter %d was null. All pointers to "
          "(sub-)buffers must not be null, unless the (sub-)buffer has "
          "zero elements.",
          allocation.param_shape_index().ToString(), param_no);
    }
    return registered_buffer;
  } else if (allocation.is_constant()) {
    auto it = globals->find(arg_idx);
    if (it == globals->end()) {
      return se::DeviceMemoryBase();
    }
    return it->second;
  } else {
    // Allocate each allocation that might escape, or is the temp buffer.
    CHECK(allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer());
    const int64_t buffer_size = allocation.size();
    se::DeviceMemoryBase buffer_address;
    if (buffer_size > 0) {
      StatusOr<se::OwningDeviceMemory> buffer =
          memory_allocator->Allocate(device_ordinal, buffer_size);
      if (!buffer.ok()) {
        return ResourceExhausted("%s\n%s\n", buffer.status().error_message(),
                                 verbose_buffer_assignment_string_dumper_());
      }
      buffer_address = buffer->Release();
    }
    return buffer_address;
  }
}

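// Checks that `buffer` satisfies the alignment contract for its allocation
// class (entry parameter, constant, or XLA-allocated buffer).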
static Status CheckAlignment(const BufferAllocation& allocation,
                             se::DeviceMemoryBase buffer, int arg_idx) {
  const int64_t expected_alignment = [&] {
    if (allocation.is_entry_computation_parameter()) {
      return kEntryParameterAlignBytes;
    } else if (allocation.is_constant()) {
      return kConstantBufferAlignBytes;
    } else {
      return kXlaAllocatedBufferAlignBytes;
    }
  }();
  if (!buffer.is_null() &&
      reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment != 0) {
    return InternalError(
        "Address of buffer %d must be a multiple of %x, but "
        "was %p",
        arg_idx, expected_alignment, buffer.opaque());
  }
  return Status::OK();
}

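// Resolves one device buffer per BufferAllocation (see BufferForAllocation)
// and verifies its alignment, producing the BufferAllocations map used by all
// execution paths below.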
StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
    VariantArguments arguments,
    const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
    se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal) {
  tensorflow::profiler::TraceMe hlo_module_activity(
      [&] { return std::string("Build buffer allocations"); },
      tensorflow::profiler::TraceMeLevel::kInfo);

  const int64_t num_buffers = allocations_.size();
  std::vector<se::DeviceMemoryBase> buffers;
  buffers.reserve(num_buffers);
  for (int64_t i = 0; i < num_buffers; ++i) {
    const BufferAllocation& allocation = allocations_[i];
    TF_ASSIGN_OR_RETURN(
        se::DeviceMemoryBase buffer,
        BufferForAllocation(arguments, globals, allocation, memory_allocator,
                            device_ordinal, i));
    buffers.push_back(buffer);
    TF_RETURN_IF_ERROR(CheckAlignment(allocation, buffer, i));
  }
  return {{buffers, device_ordinal, memory_allocator}};
}

StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    std::vector<ExecutionInput> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  return ExecuteAsyncOnStreamImpl(run_options, absl::MakeSpan(arguments));
}

StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    absl::Span<const ShapedBuffer* const> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  TF_ASSIGN_OR_RETURN(ExecutionOutput out,
                      ExecuteAsyncOnStreamImpl(run_options, arguments));
  return out.ConsumeResult();
}

#if XLA_ENABLE_XLIR
// TODO(hanbinyoon): Deduplicate with that in bef_thunk.cc.
static tfrt::RCReference<tfrt::AsyncValue> CreateGpuBuffer(
    stream_executor::DeviceMemoryBase* data) {
#if TENSORFLOW_USE_ROCM
  auto platform = tfrt::gpu::wrapper::Platform::ROCm;
#else
  auto platform = tfrt::gpu::wrapper::Platform::CUDA;
#endif

  tfrt::gpu::wrapper::Pointer<void> pointer(data->opaque(), platform);
  auto allocator =
      tfrt::MakeAvailableAsyncValueRef<tfrt::gpu::GpuOneShotAllocator<void>>(
          pointer);
  auto buffer =
      tfrt::gpu::GpuBuffer::Allocate(std::move(allocator), data->size());
  if (!buffer)
    return tfrt::MakeErrorAsyncValueRef(tfrt::StrCat(buffer.takeError()));
  return tfrt::MakeAvailableAsyncValueRef<tfrt::gpu::GpuBuffer>(
      std::move(*buffer));
}

// TODO(hanbinyoon): Deduplicate with that in bef_thunk.cc.
static StatusOr<std::unique_ptr<tfrt::ExecutionContext>> CreateExecutionContext(
    const Thunk::ExecuteParams& params,
    tfrt::RequestContextBuilder request_context_builder) {
  TF_ASSIGN_OR_RETURN(GlobalDeviceId global_device_id,
                      params.GetGlobalDeviceId());
  request_context_builder.context_data().emplace<XlaGpuParams>(XlaGpuParams{
      params.run_id, params.device_assn, params.gpu_global_device_ids,
      params.nccl_unique_id_callback, global_device_id,
      GetOrCreateInfeedManager(params.stream->parent()),
      GetOrCreateOutfeedManager(params.stream->parent())});

  auto expected_req_ctx = std::move(request_context_builder).build();
  if (!expected_req_ctx) {
    auto error = expected_req_ctx.takeError();
    return tensorflow::errors::Internal(llvm::toString(std::move(error)));
  }
  return std::make_unique<tfrt::ExecutionContext>(std::move(*expected_req_ctx));
}

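// Executes the BEF entry-point function: borrows the GPU stream into TFRT,
// wraps every buffer allocation as a tfrt::gpu::GpuBuffer argument, invokes
// the function, and waits for its result chain.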
static Status ExecuteBef(const std::string& module_name,
                         GpuExecutable::BefExecutable* bef_executable,
                         const ServiceExecutableRunOptions* run_options,
                         const BufferAllocations& buffer_allocations,
                         size_t num_allocations, bool block_host_until_done) {
  uint64_t start_micros = tensorflow::Env::Default()->NowMicros();

  tensorflow::profiler::TraceMe hlo_module_activity(
      [&] { return absl::StrCat(module_name, ":XLA GPU module"); },
      tensorflow::profiler::TraceMeLevel::kInfo);

  // TODO(hanbinyoon): Expand on the annotation.
  ScopedAnnotation annotation("BefExecution");

  se::gpu::GpuStream* stream = se::gpu::AsGpuStream(run_options->stream());
  auto gpu_context = bef_executable->gpu_ctx_cache->GetOrCreate(
      se::gpu::GpuDriver::GetContextHandle(stream->parent()->gpu_context()));
  auto gpu_stream =
      tfrt::gpu::MakeBorrowedStream(gpu_context.first, stream->gpu_stream());

  // Create execution context.
  Thunk::ExecuteParams params(*run_options, buffer_allocations,
                              run_options->stream(), nullptr);
  tfrt::RequestContextBuilder request_context_builder(
      bef_executable->host_ctx.get(), gpu_context.second);
  TF_ASSIGN_OR_RETURN(
      std::unique_ptr<tfrt::ExecutionContext> exec_ctx,
      CreateExecutionContext(params, std::move(request_context_builder)));

  // Create owning handles for the arguments and add pointers to them to
  // 'args'.
  const tfrt::Function* function = bef_executable->function;
  llvm::SmallVector<tfrt::AsyncValue*, 8> args;
  args.reserve(function->num_arguments());
  tfrt::AsyncValueRef<tfrt::Chain> chain = tfrt::GetReadyChain();
  args.push_back(chain.GetAsyncValue());
  args.push_back(gpu_stream.get().value());
  llvm::SmallVector<tfrt::RCReference<tfrt::AsyncValue>, 8> buffers;
  for (size_t i = 0; i < num_allocations; i++) {
    auto input = buffer_allocations.GetDeviceAddress(i);
    buffers.push_back(CreateGpuBuffer(&input));
    args.push_back(buffers.back().get());
  }
  if (args.size() != function->num_arguments())
    return InternalError("Unexpected argument count.");

  // Create return chain.
  tfrt::RCReference<tfrt::AsyncValue> result;
  if (function->num_results() != 1)
    return InternalError("Unexpected result count.");

  // Capture errors and augment them with source information.
  std::string diag_str;
  llvm::raw_string_ostream diag_os(diag_str);
  llvm::SourceMgr src_mgr;
  mlir::SourceMgrDiagnosticHandler handler(src_mgr, &bef_executable->mlir_ctx,
                                           diag_os);

  // Execute the function.
  function->Execute(*exec_ctx, args, {result});

  // Wait for async results to be ready.
  tfrt::Await(*exec_ctx, llvm::makeArrayRef(result));

  // Report any error from the handler or the result.
  if (diag_os.tell()) return tensorflow::errors::Internal(diag_os.str());
  if (auto* error = result->GetErrorIfPresent())
    return tensorflow::errors::Internal(tfrt::StrCat(*error));

  return MaybeSyncAndProfile(
      run_options, start_micros,
      block_host_until_done ? run_options->stream() : nullptr);
}

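// Executes the JitRt executable by packing every buffer allocation as a 1D
// memref argument in the CallFrame, then invoking the default (static shape)
// executable.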
static Status ExecuteJitRt(const std::string& module_name,
                           GpuExecutable::JitRtExecutable* jitrt_executable,
                           const ServiceExecutableRunOptions* run_options,
                           const BufferAllocations& buffer_allocations,
                           size_t num_allocations, bool block_host_until_done) {
  uint64_t start_micros = tensorflow::Env::Default()->NowMicros();

  tensorflow::profiler::TraceMe hlo_module_activity(
      [&] { return absl::StrCat(module_name, ":XLA GPU module"); },
      tensorflow::profiler::TraceMeLevel::kInfo);

  ScopedAnnotation annotation(
      []() -> std::string { return "JitRtExecutable"; });

  // TODO(ezhulenev): Here we rely on implementation details of passing memrefs
  // to the compiled kernel. We should have a nicer API to do this, without
  // creating a vector of temporary MemrefDesc for passing operands.

  // Pack buffer allocations as executable arguments. It is guaranteed that
  // the compiled function will make a copy of all arguments and will write
  // all results after the call to `Execute` completes, so it is safe to keep
  // them on the stack.
  jitrt::Executable::CallFrame call_frame;

  // Each buffer allocation is passed as a 1D memref to the compiled kernel:
  //   {basePtr, dataPtr, offset, [sizes, ...], [strides, ...]}
  size_t num_args_ptrs = 1 + num_allocations * 5;
  call_frame.args.resize_for_overwrite(num_args_ptrs);

  // Pass pointers to these constants as a memref offset and stride.
  int64_t zero = 0;
  int64_t one = 1;
  void* offset = &zero;
  void* stride = &one;

  // Add a placeholder for the kernel context as the first argument.
  call_frame.args[0] = nullptr;

  // Storage for data pointers.
  llvm::SmallVector<void*, 16> ptrs;
  ptrs.resize_for_overwrite(num_allocations);

  // Initialize arguments for the buffer operands.
  for (unsigned i = 0; i < num_allocations; ++i) {
    void* data = &(ptrs[i] = buffer_allocations.GetDeviceAddress(i).opaque());
    void* size = const_cast<int64_t*>(&jitrt_executable->buffer_size(i));
    unsigned idx = 1 + i * 5;
    call_frame.args[idx + 0] = data;
    call_frame.args[idx + 1] = data;
    call_frame.args[idx + 2] = offset;
    call_frame.args[idx + 3] = size;
    call_frame.args[idx + 4] = stride;
  }
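  // For example, with two allocations the args array is laid out as:
  //   args[0]     -> kernel context placeholder
  //   args[1..5]  -> {base, data, offset, size, stride} of allocation 0
  //   args[6..10] -> {base, data, offset, size, stride} of allocation 1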

  // JitRt executables do not return any values.
  jitrt::NoOpReturnValueConverter converter;

  // Prepare options for executing JitRt program.
  jitrt::Executable::ExecuteOpts opts;

  // We don't expect to see any async tasks in the JitRt executable.
  opts.async_task_runner =
      reinterpret_cast<jitrt::AsyncTaskRunner*>(0xDEADBEEF);

  // Pass auxiliary data to the custom call handlers.
  jitrt::CustomCall::UserData user_data;
  user_data.insert_all(run_options, &jitrt_executable->debug_options(),
                       &jitrt_executable->kernels_cache(),
                       &jitrt_executable->gemm_configs_cache());
  opts.custom_call_data = &user_data;

  // Get the default executable. We do not support specialization because
  // all shapes are static. Default executable is guaranteed to be available.
  jitrt::Executable& executable = jitrt_executable->default_executable();

  // Execute with the prepared call frame.
  executable.Execute(call_frame, opts);
  if (auto err = executable.ReturnResults(converter, &call_frame))
    return InternalError("Failed to execute JitRt executable: %s.",
                         tfrt::StrCat(err));

  return MaybeSyncAndProfile(
      run_options, start_micros,
      block_host_until_done ? run_options->stream() : nullptr);
}
#endif  // XLA_ENABLE_XLIR

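// Shared driver for all execution paths: resolves constants, materializes
// buffer allocations, wires up input/output aliasing, runs the
// thunk/BEF/JitRt backend, and finally tears down temporary allocations.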
StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
    const ServiceExecutableRunOptions* run_options,
    VariantArguments arguments) {
  XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
      "GpuExecutable::ExecuteAsyncOnStreamImpl(", module_name_, ")"));
  se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
  // Force synchronous execution if the allocator requires it.
  const bool block_host_until_done =
      !memory_allocator->AllowsAsynchronousDeallocation();

  se::StreamExecutor* executor = run_options->stream()->parent();

  // Lock the GPU with a shared lock so that we don't interfere with autotuning
  // that may be running during JIT compilation while allowing multiple XLA
  // computations to use the same GPU simultaneously.
  absl::ReaderMutexLock gpu_lock(&GetGpuMutex(executor));

  const GpuExecutable::BufferAllocToDeviceMemoryMap* globals;
  {
    tensorflow::profiler::TraceMe hlo_module_activity(
        [&] { return std::string("Resolve constant globals"); },
        tensorflow::profiler::TraceMeLevel::kInfo);

    TF_ASSIGN_OR_RETURN(globals, ResolveConstantGlobals(run_options->stream()));
  }

  auto device_ordinal = executor->device_ordinal();
  ExecutionOutput result(/*on_device_shape=*/output_shape_, memory_allocator,
                         device_ordinal);

  TF_ASSIGN_OR_RETURN(
      BufferAllocations buffer_allocations,
      GenerateBufferAllocations(arguments, globals, memory_allocator,
                                device_ordinal));
  VLOG(2) << buffer_allocations.ToString();
  std::set<se::DeviceMemoryBase> buffers_in_result;

  const bool is_entire_tuple_contents_aliased = [&] {
    for (auto& p : result.MutableResult()->buffers().leaves()) {
      if (!output_info_.contains(p.first)) {
        continue;
      }
      const OutputInfo& output_info = output_info_.at(p.first);
      if (!output_info.alias_config.has_value()) {
        return false;
      }
    }
    return true;
  }();

  for (auto& p : result.MutableResult()->buffers()) {
    const ShapeIndex& index = p.first;
    if (!output_info_.contains(index)) {
      continue;
    }
    const OutputInfo& output_info = output_info_.at(index);
    const BufferAllocation* allocation =
        &allocations_[output_info.allocation_index];
    se::DeviceMemoryBase& result_buffer = p.second;

    VLOG(4) << "Looking at: allocation " << output_info.allocation_index
            << " @ index: " << index.ToString();

    if (output_info.alias_config) {
      MaybeOwningDeviceMemory* maybe_owning_memory =
          [&]() -> xla::MaybeOwningDeviceMemory* {
        // A ShapedBuffer is never an owned buffer.
        if (auto* unowned_shapedbuffers =
                absl::get_if<absl::Span<const ShapedBuffer* const>>(
                    &arguments)) {
          return nullptr;
        } else {
          auto unowned_execution_input =
              absl::get<absl::Span<ExecutionInput>>(arguments);
          ExecutionInput& input =
              unowned_execution_input[allocation->parameter_number()];
          return input.MutableBuffer(allocation->param_shape_index());
        }
      }();
      if (output_info.alias_config->must_alias() && maybe_owning_memory &&
          !maybe_owning_memory->HasOwnership()) {
        return InvalidArgument(
            "An input was configured to be must-alias at "
            "compile time but not donated at runtime: allocation %d",
            output_info.allocation_index);
      }
      if (maybe_owning_memory && maybe_owning_memory->HasOwnership()) {
        absl::optional<tensorflow::se::OwningDeviceMemory> owning =
            maybe_owning_memory->Release();
        // If the caller passes the ownership of the device memory, reuse it
        // as the output buffer. It is up to the caller whether or not to
        // donate a buffer; the aliasing information describes which buffers
        // may alias, not buffers that must alias.
        se::DeviceMemoryBase argument_buffer = owning->Release();
        *maybe_owning_memory = argument_buffer;
        result_buffer = argument_buffer;
        // The caller is giving us the input buffer, but in case of error from
        // the execute call, we should not be releasing it as it contains
        // valid data (for example, it is a parameter which the user wants us
        // to alias, in a gradient update computation). So we store the index
        // into the result in the aliased vector, which will be fed to the
        // ExecutionOutput, which will use the indices to drop the addresses
        // from its own ScopedShapedBuffer result, if the ExecutionOutput is
        // not committed.
        result.AddAliasedIndex(index);
      } else if (!output_info.passthrough &&
                 !ShapeUtil::GetSubshape(output_shape_, index).IsTuple()) {
        // The guard above is there to avoid inserting copy-protection when
        // aliasing pass-through params, as we do not need to write into the
        // output buffer.
        VLOG(3) << "Using copy-protection: aliasing is specified, but the "
                   "buffer is not donated; allocating a fresh buffer";
        int64_t allocation_size =
            ShapeUtil::ByteSizeOf(ShapeUtil::GetSubshape(output_shape_, index));
        StatusOr<se::OwningDeviceMemory> allocated_buffer =
            memory_allocator->Allocate(device_ordinal, allocation_size);
        if (!allocated_buffer.ok()) {
          return ResourceExhausted("%s\n%s\n",
                                   allocated_buffer.status().error_message(),
                                   verbose_buffer_assignment_string_dumper_());
        }
        result_buffer = allocated_buffer->Release();
        se::DeviceMemoryBase& aliased_buffer =
            buffer_allocations.GetMutableDeviceAddress(
                output_info.allocation_index);
        CHECK_EQ(aliased_buffer.size(), result_buffer.size());
        run_options->stream()->ThenMemcpyD2D(&result_buffer, aliased_buffer,
                                             aliased_buffer.size());
        aliased_buffer = result_buffer;
      }
    }

    if (result_buffer.is_null()) {
      // The source instruction should have a non-parameter buffer
      // assigned.
      result_buffer =
          buffer_allocations.GetDeviceAddress(output_info.allocation_index);

      // If the entire tuple contents is aliased, the copy insertion will *not*
      // materialize a new tuple, so we mark it as aliased as well.
      if (is_entire_tuple_contents_aliased) {
        result.AddAliasedIndex(index);
      }
    }
    buffers_in_result.insert(result_buffer);
  }

  TF_RETURN_IF_ERROR(ExecuteThunksOrBef(run_options, buffer_allocations,
                                        block_host_until_done));

  // Free all temporary allocations.
  TF_RETURN_IF_ERROR(
      buffer_allocations.TearDown(buffers_in_result, allocations_));

  // Free allocations for arguments.
  if (auto args = absl::get_if<absl::Span<ExecutionInput>>(&arguments)) {
    MarkToBeReleasedArguments(*args, result);
  }
  return std::move(result);
}

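// Dispatches to whichever backend this executable was built with: a thunk
// schedule, a BEF executable, or a JitRt executable.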
Status GpuExecutable::ExecuteThunksOrBef(
    const ServiceExecutableRunOptions* run_options,
    const BufferAllocations& buffer_allocations, bool block_host_until_done) {
  TF_RETURN_IF_ERROR(
      CheckCompatibilityWithServiceExecutableRunOptions(run_options));

  if (thunks_) {
    se::StreamExecutor* executor = run_options->stream()->parent();
    for (const std::unique_ptr<Thunk>& thunk : thunks_->TotalOrder()) {
      TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
    }
    return ExecuteThunks(module_name_, *thunks_, run_options,
                         buffer_allocations, block_host_until_done);
  }

#if XLA_ENABLE_XLIR
  if (bef_executable_) {
    return ExecuteBef(module_name_, bef_executable_, run_options,
                      buffer_allocations, allocations_.size(),
                      block_host_until_done);
  }

  if (jitrt_executable_) {
    return ExecuteJitRt(module_name_, jitrt_executable_, run_options,
                        buffer_allocations, allocations_.size(),
                        block_host_until_done);
  }
#endif  // XLA_ENABLE_XLIR

  return FailedPrecondition("No XLA gpu executable was supplied.");
}

int64_t GpuExecutable::SizeOfGeneratedCodeInBytes() const {
  // Non-empty PTX but empty cubin: compilation must have failed, return
  // "unknown".
  if (binary().empty() && !text_.empty()) {
    return -1;
  }
  int64_t size = binary().size();
  for (BufferAllocation::Index i = 0; i < allocations_.size(); ++i) {
    const BufferAllocation& allocation = allocations_[i];
    if (allocation.is_constant()) {
      size += allocation.size();
    }
  }
  return size;
}

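// Reconstructs the BufferAllocations, output metadata, and output shape from
// the lmhlo.* argument attributes that the MLIR lowering attached to the
// entry function.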
Status GpuExecutable::SetUpMlirAllocation(
    mlir::func::FuncOp func, llvm::ArrayRef<int64_t> buffer_sizes,
    std::vector<BufferAllocation>* allocations,
    absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>* output_info,
    Shape* output_shape, int buffer_param_offset) {
  for (int i = 0; i < buffer_sizes.size(); i++) {
    allocations->emplace_back(i, buffer_sizes[i], 0);
  }

  for (int i = 0; i < func.getNumArguments(); i++) {
    if (i < buffer_param_offset) {
      continue;
    }
    const int buffer_index = i - buffer_param_offset;

    if (auto param_attr = func.getArgAttr(i, "lmhlo.params")) {
      xla::ShapeIndex shape_index;
      if (auto shape_index_attr =
              func.getArgAttrOfType<mlir::DenseIntElementsAttr>(
                  i, "lmhlo.param_shape_index")) {
        for (const llvm::APInt& element : shape_index_attr) {
          shape_index.push_back(element.getSExtValue());
        }
      }
      allocations->at(buffer_index)
          .set_entry_computation_parameter(
              param_attr.cast<mlir::IntegerAttr>().getInt(), shape_index,
              static_cast<bool>(func.getArgAttr(i, "lmhlo.output_index")));
    }
    // TODO(timshen): this information is redundant. This is here only for
    // smooth migration to LMHLO. Remove it.
    if (func.getArgAttr(i, "lmhlo.constant_name")) {
      allocations->at(buffer_index).set_constant(true);
    }
    if (auto output_index_attr = func.getArgAttr(i, "lmhlo.output_index")) {
      allocations->at(buffer_index).set_maybe_live_out(true);

      // Reconstruct a shape index from output_index.
      ShapeIndex shape_index;
      for (const llvm::APInt& element :
           output_index_attr.cast<mlir::DenseIntElementsAttr>()) {
        shape_index.push_back(element.getSExtValue());
      }
      auto& o = (*output_info)[shape_index];
      o.allocation_index = buffer_index;
      if (auto param_attr = func.getArgAttr(i, "lmhlo.params")) {
        HloInputOutputAliasConfig::AliasKind kind =
            HloInputOutputAliasConfig::kMayAlias;
        if (func.getArgAttr(i, "lmhlo.must_alias")) {
          kind = HloInputOutputAliasConfig::kMustAlias;
        }
        o.alias_config.emplace(param_attr.cast<mlir::IntegerAttr>().getInt(),
                               ShapeIndex{}, kind);
      }
      if (func.getArgument(i).use_empty()) {
        o.passthrough = true;
      }
    }
  }
  // Expects result_xla_shape as an XLA shape in string form.
  //
  // The attribute is necessary, because GpuExecutable/ExecutionOutput supports
  // tuples / tree-like shapes, while the LMHLO argument list loses the tree
  // form.
  //
  // The string format is necessary since MLIR doesn't support XLA shapes with
  // dynamic dimensions.
  //
  // TODO(timshen): now this field is mandatory. Make it optional for
  // non-GpuExecutable outputs.
  TF_ASSIGN_OR_RETURN(
      *output_shape,
      ParseShape(func->getAttrOfType<mlir::StringAttr>("result_xla_shape")
                     .getValue()
                     .str()));

  return Status::OK();
}

#if XLA_ENABLE_XLIR
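// Re-attaches the serialized entry-function attributes (lmhlo.params,
// lmhlo.output_index, etc.) to the function recovered from the BEF, so that
// SetUpMlirAllocation can consume them as if the module came straight from
// the compiler.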
static void ApplyEntryFunctionAttributes(
    mlir::MLIRContext& context, mlir::func::FuncOp& func,
    xla::EntryFunctionAttributes entry_func_attrs, int buffer_param_offset) {
  mlir::OpBuilder builder(&context);
  llvm::SmallVector<mlir::DictionaryAttr, 8> args_attrs;
  for (int i = 0; i < func.getNumArguments(); i++) {
    mlir::NamedAttrList arg_attr_list;
    if (i < buffer_param_offset) {
      args_attrs.push_back(arg_attr_list.getDictionary(&context));
      continue;
    }
    const auto& buffer = entry_func_attrs.buffers(i - buffer_param_offset);

    if (buffer.lmhlo_params_present()) {
      arg_attr_list.set("lmhlo.params",
                        builder.getIndexAttr(buffer.lmhlo_params()));
    }
    if (buffer.has_lmhlo_param_shape_index()) {
      arg_attr_list.set(
          "lmhlo.param_shape_index",
          builder.getI64TensorAttr(llvm::makeArrayRef(
              buffer.lmhlo_param_shape_index().indices().data(),
              buffer.lmhlo_param_shape_index().indices().size())));
    }
    if (!buffer.lmhlo_constant_name().empty()) {
      arg_attr_list.set("lmhlo.constant_name",
                        builder.getStringAttr(buffer.lmhlo_constant_name()));
    }
    if (buffer.lmhlo_must_alias()) {
      arg_attr_list.set("lmhlo.must_alias", builder.getUnitAttr());
    }
    if (buffer.has_lmhlo_output_index()) {
      arg_attr_list.set("lmhlo.output_index",
                        builder.getI64TensorAttr(llvm::makeArrayRef(
                            buffer.lmhlo_output_index().indices().data(),
                            buffer.lmhlo_output_index().indices().size())));
    }
    args_attrs.push_back(arg_attr_list.getDictionary(&context));
  }
  func.setAllArgAttrs(args_attrs);
  func->setAttr("result_xla_shape",
                builder.getStringAttr(entry_func_attrs.result_xla_shape()));
}
#endif  // XLA_ENABLE_XLIR

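// Deserializes a GpuExecutable from a BEF buffer: copies the BEF into aligned
// memory, round-trips it back to MLIR to recover the entry function and its
// attributes, and rebuilds the buffer allocations from them.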
StatusOr<std::unique_ptr<Executable>> GpuExecutable::LoadFromBef(
    std::shared_ptr<HloModule> hlo_module, absl::string_view bef,
    xla::EntryFunctionAttributes entry_func_attrs, GpuVersion gpu_version,
    se::StreamExecutor* executor) {
#if XLA_ENABLE_XLIR
  OwnedBefBuffer bef_buffer = [bef]() {
    auto ptr = static_cast<uint8_t*>(
        tfrt::AlignedAlloc(tfrt::GetRequiredBefAlignment(), bef.size()));
    std::copy(bef.begin(), bef.end(), ptr);
    return OwnedBefBuffer(ptr, {bef.size()});
  }();

  mlir::MLIRContext context;
  mlir::DialectRegistry registry;
  tfrt::RegisterTFRTDialects(registry);
  tfrt::RegisterTFRTCompiledDialects(registry);
  registry.insert<tfrt::gpu::GpuDialect>();
  registry.insert<XlirDialect>();
  context.appendDialectRegistry(registry);
  for (const auto& dialect_name : context.getAvailableDialects()) {
    context.getOrLoadDialect(dialect_name);
  }
  context.allowUnregisteredDialects();
  mlir::Location location = mlir::UnknownLoc::get(&context);
  llvm::ArrayRef<uint8_t> bef_array(bef_buffer.get(),
                                    bef_buffer.get_deleter().size);
  auto module = tfrt::ConvertBEFToMLIR(location, bef_array, &context);
  TF_ASSIGN_OR_RETURN(
      OwnedGpuContextCache gpu_ctx_cache,
      GpuExecutable::CreatePreloadedGpuContextCache(bef_array, executor));
  TF_ASSIGN_OR_RETURN(
      BefExecutable * bef_executable,
      BefExecutable::Create(std::move(bef_buffer), std::move(gpu_ctx_cache)));
  auto func = mlir::cast<mlir::func::FuncOp>(
      module->lookupSymbol(bef_executable->entry_point.function_name));
  // In the tfrt_gpu dialect, buffer arguments start from the third parameter
  // (after tfrt::Chain and GpuStream).
  int buffer_param_offset = 2;
  ApplyEntryFunctionAttributes(context, func, entry_func_attrs,
                               buffer_param_offset);

  std::vector<BufferAllocation> allocations;
  absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
  Shape result_xla_shape;
  TF_RETURN_IF_ERROR(SetUpMlirAllocation(
      func, bef_executable->entry_point.buffer_sizes, &allocations,
      &output_info, &result_xla_shape, buffer_param_offset));

  std::unique_ptr<Executable> executable;
  std::string module_name = mlir::GetNameFromLoc(module->getLoc());
  // Calling the private constructor.
  executable = absl::WrapUnique(
      new GpuExecutable(std::move(hlo_module), gpu_version, entry_func_attrs,
                        module_name, result_xla_shape, std::move(allocations),
                        std::move(output_info), bef_executable));
  return executable;
#else   // XLA_ENABLE_XLIR
  return FailedPrecondition("Not built with XLA_ENABLE_XLIR");
#endif  // XLA_ENABLE_XLIR
}

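// Builds a GpuContextCache whose GPU resources are preloaded from the BEF via
// tfrt::gpu::PreloadGpuResources, rather than being loaded lazily on first
// execution.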
StatusOr<GpuExecutable::OwnedGpuContextCache>
GpuExecutable::CreatePreloadedGpuContextCache(llvm::ArrayRef<uint8_t> bef_array,
                                              se::StreamExecutor* executor) {
#if XLA_ENABLE_XLIR
  mlir::MLIRContext context;
  std::unique_ptr<tfrt::HostContext> host_ctx =
      tfrt::gpu::CreateHostContext(tfrt::gpu::GetDiagHandler(&context));

  auto bef_file =
      tfrt::BEFFile::Open(bef_array, host_ctx->GetKernelRegistry(),
                          host_ctx->diag_handler(), host_ctx->allocator());
  if (!bef_file) {
    return InternalError("Failed to decode BEF buffer");
  }

  auto gpu_executor =
      tensorflow::down_cast<se::gpu::GpuExecutor*>(executor->implementation());
  auto gpu_context =
      se::gpu::GpuDriver::GetContextHandle(gpu_executor->gpu_context());

  auto gpu_ctx_cache = OwnedGpuContextCache(new tfrt::gpu::GpuContextCache);
  auto context_and_resource = gpu_ctx_cache->GetOrCreate(gpu_context);
  auto exec_ctx = tfrt::gpu::CreateExecutionContext(
      host_ctx.get(), context_and_resource.second);

  if (auto error = tfrt::gpu::PreloadGpuResources(
          *bef_file, *exec_ctx, context_and_resource.first.CopyRef())) {
    return tensorflow::errors::Internal(llvm::toString(std::move(error)));
  }
  return gpu_ctx_cache;
#else   // XLA_ENABLE_XLIR
  return FailedPrecondition("Not built with XLA_ENABLE_XLIR");
#endif  // XLA_ENABLE_XLIR
}

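// Computes, for each leaf of the root instruction's shape, which allocation
// holds it, whether it passes through an entry parameter, and its
// input/output aliasing configuration.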
StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>
GetOutputInfo(const HloModule& hlo_module, const BufferAssignment& assignment) {
  const HloInstruction* root =
      hlo_module.entry_computation()->root_instruction();

  InstructionValueSet root_value_set =
      assignment.dataflow_analysis().GetInstructionValueSet(root);

  if (root_value_set.IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

  using OutputInfoMap =
      absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;
  OutputInfoMap output;
  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
      root->shape(),
      [&](const Shape& /*sub_shape*/, const ShapeIndex& index) -> Status {
        const auto& sources = root_value_set.element(index);
        // The points-to set is unambiguous so the set should be a
        // singleton. That is, we know exactly which instruction
        // produced the array at this element.
        CHECK_EQ(1, sources.values().size());
        HloInstruction* src_hlo = sources.values()[0]->instruction();

        GpuExecutable::OutputInfo& info = output[index];
        info.passthrough = src_hlo->opcode() == HloOpcode::kParameter;
        TF_ASSIGN_OR_RETURN(
            const BufferAllocation::Slice slice,
            assignment.GetUniqueSlice(src_hlo, sources.values()[0]->index()));
        CHECK_EQ(slice.offset(), 0) << "Parameter should get its own slice";
        info.allocation_index = slice.index();

        output[index].alias_config =
            hlo_module.input_output_alias_config().GetAliasedParameter(index);

        return Status::OK();
      }));
  return output;
}

}  // namespace gpu
}  // namespace xla