/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Interfaces for platform-dependent implementations to satisfy. These are
// delegated to from the StreamExecutor in pointer-to-implementation style;
// i.e. the StreamExecutor is just a husk that delegates calls to the
// platform-specific objects which implement the interfaces defined here.
#ifndef TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
#define TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
#include <functional>
#include <map>
#include <memory>
#include <utility>
#include <vector>
#include "absl/types/optional.h"
#include "tensorflow/stream_executor/allocator_stats.h"
#include "tensorflow/stream_executor/device_description.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/dnn.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/kernel.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/kernel_spec.h"
#include "tensorflow/stream_executor/launch_dim.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/module_spec.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/shared_memory_config.h"
#include "tensorflow/stream_executor/trace_listener.h"
namespace stream_executor {
class Stream;
class Timer;
// An opaque handle to a loaded module.
//
// An instance of this is returned from StreamExecutor::GetModule.
class ModuleHandle {
public:
/*implicit*/ ModuleHandle(void *id = nullptr) : id_(id) {}
// A ModuleHandle with id() == nullptr is an invalid module handle, akin to a
// null pointer.
void *id() const { return id_; }
explicit operator bool() const { return id() != nullptr; }
private:
void *id_;
};
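// Example (a hypothetical usage sketch; `executor` and `spec` are assumed to
// come from the caller -- `executor` exposes the LoadModule() declared below
// and `spec` is a MultiModuleLoaderSpec):
//
//   ModuleHandle module_handle;
//   port::Status load_status = executor->LoadModule(spec, &module_handle);
//   if (load_status.ok() && module_handle) {
//     // module_handle.id() is a platform-specific opaque identifier.
//   }
//
// The explicit operator bool() lets the "is this handle valid?" check read
// like a pointer test, since a default-constructed ModuleHandle has
// id() == nullptr.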
namespace internal {
// Platform-dependent interface class for the generic Event interface, in
// the PIMPL style.
class EventInterface {
public:
EventInterface() {}
virtual ~EventInterface() {}
private:
SE_DISALLOW_COPY_AND_ASSIGN(EventInterface);
};
// Pointer-to-implementation object type (i.e. the KernelBase class delegates to
// this interface) with virtual destruction. This class exists for the
// platform-dependent code to hang any kernel data/resource info/functionality
// off of.
class KernelInterface {
public:
// Default constructor for the abstract interface.
KernelInterface() {}
// Default destructor for the abstract interface.
virtual ~KernelInterface() {}
// Returns the number of formal parameters that this kernel accepts.
virtual unsigned Arity() const = 0;
// Sets the preferred cache configuration.
virtual void SetPreferredCacheConfig(KernelCacheConfig config) = 0;
// Gets the preferred cache configuration.
virtual KernelCacheConfig GetPreferredCacheConfig() const = 0;
private:
SE_DISALLOW_COPY_AND_ASSIGN(KernelInterface);
};
// Pointer-to-implementation object type (i.e. the Stream class delegates to
// this interface) with virtual destruction. This class exists for the
// platform-dependent code to hang any stream data/resource info/functionality
// off of.
class StreamInterface {
public:
// Default constructor for the abstract interface.
StreamInterface() {}
// Default destructor for the abstract interface.
virtual ~StreamInterface() {}
// Returns the GPU stream associated with this platform's stream
// implementation.
//
// WARNING: checks that the underlying platform is, in fact, CUDA or ROCm,
// causing a fatal error if it is not. This hack is made available solely for
// use from distbelief code, which temporarily has strong ties to CUDA or
// ROCm as a platform.
virtual void *GpuStreamHack() { return nullptr; }
// See the above comment on GpuStreamHack -- this further breaks abstraction
// for Eigen within distbelief, which has strong ties to CUDA or ROCm as a
// platform, and a historical attachment to a programming model which takes a
// stream-slot rather than a stream-value.
virtual void **GpuStreamMemberHack() { return nullptr; }
private:
SE_DISALLOW_COPY_AND_ASSIGN(StreamInterface);
};
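// A hypothetical sketch of how the two hacks above differ: GpuStreamHack()
// hands back the platform stream by value, while GpuStreamMemberHack()
// exposes a slot the caller can alias directly (the stream-slot model
// mentioned above). `stream_impl` is illustrative only.
//
//   void *raw_stream = stream_impl->GpuStreamHack();          // stream value
//   void **stream_slot = stream_impl->GpuStreamMemberHack();  // stream slot
//   if (stream_slot != nullptr) {
//     // A stream-slot consumer (e.g. an Eigen GPU device) keeps this pointer
//     // and reads the current stream through it whenever it launches work.
//   }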
// Pointer-to-implementation object type (i.e. the Timer class delegates to
// this interface) with virtual destruction. This class exists for the
// platform-dependent code to hang any timer data/resource info/functionality
// off of.
class TimerInterface {
public:
// Default constructor for the abstract interface.
TimerInterface() {}
// Default destructor for the abstract interface.
virtual ~TimerInterface() {}
// Returns the number of microseconds elapsed in a completed timer.
virtual uint64 Microseconds() const = 0;
// Returns the number of nanoseconds elapsed in a completed timer.
virtual uint64 Nanoseconds() const = 0;
private:
SE_DISALLOW_COPY_AND_ASSIGN(TimerInterface);
};
// Interface for the different StreamExecutor platforms (e.g. CUDA, OpenCL).
//
// Various platforms will provide an implementation that satisfies this
// interface.
class StreamExecutorInterface {
public:
// Default constructor for the abstract interface.
StreamExecutorInterface() {}
// Default destructor for the abstract interface.
virtual ~StreamExecutorInterface() {}
// Returns the (transitively) wrapped executor if this executor is
// wrapping another executor; otherwise, returns this.
virtual StreamExecutorInterface *GetUnderlyingExecutor() { return this; }
// See the StreamExecutor interface for comments on the same-named methods.
virtual port::Status Init(int device_ordinal,
DeviceOptions device_options) = 0;
virtual port::Status GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) {
return port::UnimplementedError("Not Implemented");
}
virtual bool UnloadModule(ModuleHandle module_handle) { return false; }
virtual port::Status LoadModule(const MultiModuleLoaderSpec &spec,
ModuleHandle *module_handle) {
return port::UnimplementedError("Not Implemented");
}
virtual port::Status Launch(Stream *stream, const ThreadDim &thread_dims,
const BlockDim &block_dims, const KernelBase &k,
const KernelArgsArrayBase &args) {
return port::UnimplementedError("Not Implemented");
}
// Releases any state associated with the kernel.
virtual void UnloadKernel(const KernelBase *kernel) {}
virtual DeviceMemoryBase Allocate(uint64 size, int64 memory_space) = 0;
DeviceMemoryBase Allocate(uint64 size) {
return Allocate(size, /*memory_space=*/0);
}
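// A minimal sketch of how an implementation might dispatch on memory_space
// (memory_space 0 is the default device memory space used by the convenience
// overload above; other space identifiers and the helper names below are
// hypothetical, platform-defined details):
//
//   DeviceMemoryBase MyExecutor::Allocate(uint64 size, int64 memory_space) {
//     if (memory_space == 0) return AllocateDefault(size);
//     return AllocateInSpace(size, memory_space);
//   }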
virtual void *GetSubBuffer(DeviceMemoryBase *parent, uint64 offset,
uint64 size) = 0;
virtual void Deallocate(DeviceMemoryBase *mem) = 0;
// Allocates unified memory space of the given size, if supported.
// See
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd
// for more details on unified memory.
virtual void *UnifiedMemoryAllocate(uint64 size) { return nullptr; }
// Deallocates unified memory space previously allocated with
// UnifiedMemoryAllocate.
virtual void UnifiedMemoryDeallocate(void *mem) {}
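// A short usage sketch for the unified-memory pair above (hypothetical;
// `impl` is a StreamExecutorInterface and the caller must handle the nullptr
// "unsupported" case):
//
//   void *um = impl->UnifiedMemoryAllocate(bytes);
//   if (um != nullptr) {
//     // The buffer is addressable from both host and device.
//     impl->UnifiedMemoryDeallocate(um);
//   }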
virtual void *HostMemoryAllocate(uint64 size) = 0;
virtual void HostMemoryDeallocate(void *mem) = 0;
virtual bool HostMemoryRegister(void *mem, uint64 size) = 0;
virtual bool HostMemoryUnregister(void *mem) = 0;
virtual bool SynchronizeAllActivity() = 0;
virtual port::Status SynchronousMemZero(DeviceMemoryBase *location,
uint64 size) = 0;
virtual port::Status SynchronousMemSet(DeviceMemoryBase *location, int value,
uint64 size) = 0;
virtual port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
const void *host_src, uint64 size) = 0;
virtual port::Status SynchronousMemcpy(void *host_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) = 0;
virtual port::Status SynchronousMemcpyDeviceToDevice(
DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src,
uint64 size) = 0;
virtual port::Status MemZero(Stream *stream, DeviceMemoryBase *location,
uint64 size) = 0;
virtual port::Status Memset(Stream *stream, DeviceMemoryBase *location,
uint8 pattern, uint64 size) {
return port::InternalError("Not implemented");
}
virtual port::Status Memset32(Stream *stream, DeviceMemoryBase *location,
uint32 pattern, uint64 size) = 0;
virtual bool Memcpy(Stream *stream, void *host_dst,
const DeviceMemoryBase &gpu_src, uint64 size) = 0;
virtual bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst,
const void *host_src, uint64 size) = 0;
virtual bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) = 0;
virtual bool HostCallback(Stream *stream, std::function<void()> callback);
virtual bool HostCallback(Stream *stream,
std::function<port::Status()> callback) = 0;
virtual port::Status AllocateEvent(Event *event) = 0;
virtual port::Status DeallocateEvent(Event *event) = 0;
virtual port::Status RecordEvent(Stream *stream, Event *event) = 0;
virtual port::Status WaitForEvent(Stream *stream, Event *event) = 0;
virtual Event::Status PollForEventStatus(Event *event) = 0;
virtual bool AllocateStream(Stream *stream) = 0;
virtual void DeallocateStream(Stream *stream) = 0;
virtual bool CreateStreamDependency(Stream *dependent, Stream *other) = 0;
virtual bool AllocateTimer(Timer *timer) = 0;
virtual void DeallocateTimer(Timer *timer) = 0;
virtual bool StartTimer(Stream *stream, Timer *timer) = 0;
virtual bool StopTimer(Stream *stream, Timer *timer) = 0;
virtual port::Status BlockHostUntilDone(Stream *stream) = 0;
virtual port::Status GetStatus(Stream *stream) {
return port::Status(port::error::UNIMPLEMENTED,
"GetStatus is not supported on this executor.");
}
virtual int PlatformDeviceCount() = 0;
virtual port::Status EnablePeerAccessTo(StreamExecutorInterface *other) = 0;
virtual bool CanEnablePeerAccessTo(StreamExecutorInterface *other) = 0;
virtual SharedMemoryConfig GetDeviceSharedMemoryConfig() = 0;
virtual port::Status SetDeviceSharedMemoryConfig(
SharedMemoryConfig config) = 0;
virtual int64 GetDeviceLoad() { return -1; }
virtual bool DeviceMemoryUsage(int64 *free, int64 *total) const {
return false;
}
// Retrieves the device pointer and size for a symbol. The device pointer is
// stored at mem, and the size is stored at bytes. Either mem or bytes can be
// null; however, they cannot both be null at the same time. To use constant
// memory in CUDA, GetSymbol has to be used. Returns true if the symbol is
// found.
//
// If ModuleHandle is set then we search for `symbol_name` only within the
// module corresponding to `module_handle`. Otherwise all loaded modules are
// searched.
virtual bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
void **mem, size_t *bytes) {
return false;
}
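// A hedged usage sketch for GetSymbol (names are illustrative; the
// module_handle would typically come from LoadModule above, or be a
// default-constructed ModuleHandle to search all loaded modules):
//
//   void *symbol_ptr = nullptr;
//   size_t symbol_bytes = 0;
//   if (impl->GetSymbol("my_constant", module_handle, &symbol_ptr,
//                       &symbol_bytes)) {
//     // symbol_ptr and symbol_bytes now describe the device-resident symbol.
//   }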
// Creates a new DeviceDescription object. Ownership is transferred to the
// caller.
virtual port::StatusOr<std::unique_ptr<DeviceDescription>>
CreateDeviceDescription() const = 0;
// Attempts to register the provided TraceListener with the device-specific
// Executor implementation. When this is called, the PIMPL interface has
// already taken ownership of the object and is managing the generic tracing
// events. The device-specific implementation must determine if the passed
// listener is of a type appropriate for it to trace during registration (and
// before dispatching events to it).
// Returns true if the listener was successfully registered, false otherwise.
// Does not take ownership of listener.
virtual bool RegisterTraceListener(TraceListener *listener) { return false; }
// Unregisters the specified listener from the device-specific Executor.
// Returns true if the listener was successfully unregistered, false otherwise.
virtual bool UnregisterTraceListener(TraceListener *listener) {
return false;
}
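// Registration sketch (hypothetical): the generic StreamExecutor machinery
// owns the listener; this interface only decides whether to dispatch
// platform-specific events to it.
//
//   if (impl->RegisterTraceListener(listener)) {
//     // Platform-specific trace events now reach `listener`.
//     impl->UnregisterTraceListener(listener);
//   }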
// Returns whether this StreamExecutor has BLAS support for its underlying
// platform.
virtual bool SupportsBlas() const { return false; }
// Creates a new BlasSupport object, ownership is transferred to the caller.
// If SupportsBlas() is false, this will always return null.
//
// If SupportsBlas() is true, this may return null, for example, if the BLAS
// initialization fails.
virtual blas::BlasSupport *CreateBlas() { return nullptr; }
// Returns whether this StreamExecutor has FFT support for its underlying
// platform.
virtual bool SupportsFft() const { return false; }
// Creates a new fft::FftSupport object, ownership is transferred to the
// caller.
// If SupportsFft() is false, this will always return null.
//
// If SupportsFft() is true, this may return null, for example, if the FFT
// initialization fails.
virtual fft::FftSupport *CreateFft() { return nullptr; }
// Returns whether this StreamExecutor has Random Number Generation support
// for its underlying platform.
virtual bool SupportsRng() const { return false; }
// Returns whether this StreamExecutor has neural net support for its
// underlying platform.
virtual bool SupportsDnn() const { return false; }
// Creates a new RngSupport object, ownership is transferred to the caller.
// If SupportsRng() is false, this will always return null.
//
// If SupportsRng() is true, this may return null, for example, if the RNG
// initialization fails.
virtual rng::RngSupport *CreateRng() { return nullptr; }
// Creates a new DnnSupport object, ownership is transferred to the caller.
// If SupportsDnn() is false, this will always return null.
//
// If SupportsDnn() is true, this may return null, for example, if the DNN
// initialization fails.
virtual dnn::DnnSupport *CreateDnn() { return nullptr; }
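// The SupportsX()/CreateX() pairs above share one pattern: SupportsX() is a
// cheap capability query, and CreateX() may still return null even when
// SupportsX() is true (e.g. if library initialization fails). A hypothetical
// caller-side sketch:
//
//   if (impl->SupportsBlas()) {
//     blas::BlasSupport *blas = impl->CreateBlas();  // caller owns the result
//     if (blas == nullptr) {
//       // BLAS is supported by the platform but failed to initialize.
//     }
//   }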
// Each call creates a new instance of the platform-specific implementation of
// the corresponding interface type.
virtual std::unique_ptr<EventInterface> CreateEventImplementation() = 0;
virtual std::unique_ptr<KernelInterface> CreateKernelImplementation() = 0;
virtual std::unique_ptr<StreamInterface> GetStreamImplementation() = 0;
virtual std::unique_ptr<TimerInterface> GetTimerImplementation() = 0;
// Returns the CUDA or ROCm context associated with this StreamExecutor
// platform implementation.
//
// WARNING: checks that the underlying platform is, in fact, CUDA or ROCm,
// causing a fatal error if it is not. This hack is made available solely for
// use from distbelief code, which temporarily has strong ties to CUDA or ROCm
// as a platform.
virtual void *GpuContextHack() { return nullptr; }
// Returns allocator statistics.
virtual absl::optional<AllocatorStats> GetAllocatorStats() {
return absl::nullopt;
}
// Clears the compilation cache from volatile memory. Returns OK if no
// compilation cache exists or if clearing the compilation cache is
// unsupported. Caches in non-volatile storage are unaffected.
virtual port::Status FlushCompilationCache() { return port::Status::OK(); }
private:
SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutorInterface);
};
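// A minimal skeleton of a platform implementation, shown only as a sketch of
// how a backend satisfies this interface (the class name and elided bodies
// are hypothetical, not taken from any real platform):
//
//   class MyExecutor : public StreamExecutorInterface {
//    public:
//     port::Status Init(int device_ordinal,
//                       DeviceOptions device_options) override { ... }
//     DeviceMemoryBase Allocate(uint64 size, int64 memory_space) override {
//       ...
//     }
//     void Deallocate(DeviceMemoryBase *mem) override { ... }
//     // ...plus the remaining pure-virtual methods declared above.
//   };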
} // namespace internal
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_