tensorflow/core/profiler/internal/gpu/cupti_tracer.h - platform/external/tensorflow - Git at Google

 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_TRACER_H_
 #define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_TRACER_H_

 #include "absl/container/fixed_array.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/node_hash_set.h"
 #include "absl/types/optional.h"
 #include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/internal/gpu/cupti_interface.h"

 namespace tensorflow {
 namespace profiler {

 // Details recorded for a memcpy trace event. Stored in
 // CuptiTracerEvent::memcpy_info, which is meaningful when the event type is
 // one of the Memcpy* values.
 struct MemcpyDetails {
   // The amount of data copied for memcpy events.
   size_t num_bytes;
   // The destination device for peer-2-peer communication (memcpy). The source
   // device is implicit: it is the current device.
   uint32 destination;
   // Whether or not the memcpy is asynchronous.
   bool async;
   // This contains CUpti_ActivityMemcpyKind for activity event (on device).
   // For events from other CuptiTracerEventSource, it is always 0.
   int8 kind;
 };

 // Details recorded for a memory allocation trace event. Stored in
 // CuptiTracerEvent::memalloc_info, which is meaningful when the event type is
 // MemoryAlloc.
 struct MemAllocDetails {
   // The amount of data requested for cudaMalloc events.
   uint64 num_bytes;
 };

 // Launch configuration recorded for a kernel trace event. Stored in
 // CuptiTracerEvent::kernel_info, which is meaningful when the event type is
 // Kernel.
 struct KernelDetails {
   // The number of registers used by each thread of this kernel.
   uint64 registers_per_thread;
   // The amount of static shared memory space used by a thread block.
   uint64 static_shared_memory_usage;
   // The amount of dynamic shared memory space used by a thread block.
   uint64 dynamic_shared_memory_usage;
   // X-dimension of a thread block.
   uint64 block_x;
   // Y-dimension of a thread block.
   uint64 block_y;
   // Z-dimension of a thread block.
   uint64 block_z;
   // X-dimension of a grid.
   uint64 grid_x;
   // Y-dimension of a grid.
   uint64 grid_y;
   // Z-dimension of a grid.
   uint64 grid_z;
 };

 // The kind of activity a CuptiTracerEvent describes. It also selects which
 // member of the event's details union is meaningful: Memcpy* -> memcpy_info,
 // MemoryAlloc -> memalloc_info, Kernel -> kernel_info.
 // The numeric values are spelled out explicitly; keep them stable when adding
 // new enumerators.
 enum class CuptiTracerEventType {
   Unsupported = 0,
   Kernel = 1,
   MemcpyH2D = 2,
   MemcpyD2H = 3,
   MemcpyD2D = 4,
   MemcpyP2P = 5,
   MemcpyOther = 6,
   MemoryAlloc = 7,
   Overhead = 8,
   UnifiedMemory = 9,
   Generic = 100,
 };

 // Returns a C string for `type`. Presumably this is a human-readable name of
 // the enumerator; the definition is not in this header, so confirm there
 // (including what is returned for values outside the enum).
 const char* GetTraceEventTypeName(const CuptiTracerEventType& type);

 // Identifies which tracing mechanism produced a CuptiTracerEvent.
 enum class CuptiTracerEventSource {
   DriverCallback = 0,
   Activity = 1,
   // Maybe consider adding runtime callback and metric api in the future.
 };

 // A single trace record, handed to CuptiTraceCollector::AddEvent().
 //
 // Every non-union field carries a default, so a default-constructed event
 // never holds indeterminate values. Only the anonymous details union is left
 // unset: read the member that matches `type`, and only after the producer has
 // filled it in.
 struct CuptiTracerEvent {
   // Sentinels meaning "not set" for the corresponding id fields below.
   static constexpr uint32 kInvalidThreadId =
       std::numeric_limits<uint32_t>::max();
   static constexpr uint32 kInvalidCorrelationId =
       std::numeric_limits<uint32_t>::max();
   static constexpr uint64 kInvalidContextId =
       std::numeric_limits<uint64_t>::max();
   static constexpr uint64 kInvalidStreamId =
       std::numeric_limits<uint64_t>::max();

   CuptiTracerEventType type = CuptiTracerEventType::Unsupported;
   // There is no "unknown" source; DriverCallback is simply the zero value.
   // Producers are expected to set this explicitly.
   CuptiTracerEventSource source = CuptiTracerEventSource::DriverCallback;
   // Although CUpti_CallbackData::functionName is persistent,
   // CUpti_ActivityKernel4::name is not, therefore we need a copy of it.
   std::string name;
   // This points to strings in AnnotationMap, which should outlive the point
   // where serialization happens.
   absl::string_view annotation;
   uint64 start_time_ns = 0;
   uint64 end_time_ns = 0;
   uint32 device_id = 0;
   uint32 correlation_id = kInvalidCorrelationId;
   uint32 thread_id = kInvalidThreadId;
   // The sentinels are unsigned while these two fields are signed. The cast
   // makes the pre-existing wrap-around (to -1) explicit instead of implicit.
   // Comparing a field against its sentinel still succeeds, because the signed
   // value is converted back to unsigned for the comparison.
   int64 context_id = static_cast<int64>(kInvalidContextId);
   int64 stream_id = static_cast<int64>(kInvalidStreamId);
   union {
     MemcpyDetails memcpy_info;      // If type == Memcpy*
     MemAllocDetails memalloc_info;  // If type == MemoryAlloc
     KernelDetails kernel_info;      // If type == Kernel
   };
 };

 // Options passed to CuptiTracer::Enable() to select what gets traced.
 struct CuptiTracerOptions {
   // Whether to use the CUPTI Activity API. Overridden by
   // enable_event_based_activity (see below).
   bool enable_activity_api = true;

   // Use cuda events to enclose the kernel/memcpy to measure device activity.
   // enable_event_based_activity, if true, will override the enable_activity_api
   // setting.
   bool enable_event_based_activity = false;

   // NOTE(review): not documented in this header. Presumably controls whether
   // events from the callback API are required/collected; confirm against the
   // implementation.
   bool required_callback_api_events = true;
   // Maximum number of annotation strings that we can accommodate.
   uint64 max_annotation_strings = 1024 * 1024;
   // The callback ids that will be enabled and monitored. If empty, all
   // callback ids are enabled using the Callback API.
   // We only care about the CUPTI_CB_DOMAIN_DRIVER_API domain for now. It is
   // kind of redundant to have both CUPTI_CB_DOMAIN_DRIVER_API and
   // CUPTI_CB_DOMAIN_RUNTIME_API.
   std::vector<CUpti_driver_api_trace_cbid_enum> cbids_selected;
   // Activity kinds to be collected using Activity API. If empty, the Activity
   // API is disabled.
   std::vector<CUpti_ActivityKind> activities_selected;
   // Whether to call cuptiFinalize.
   bool cupti_finalize = false;
 };

 // Limits handed to a CuptiTraceCollector at construction.
 // Both fields are unsigned, so the documented "-1" is stored as the maximum
 // uint64 value when assigned.
 struct CuptiTracerCollectorOptions {
   // Maximum number of events to collect from callback API; if -1, no limit.
   // If 0, the callback API is enabled to build a correlation map, but no
   // events are collected.
   uint64 max_callback_api_events = 2 * 1024 * 1024;
   // Maximum number of events to collect from activity API; if -1, no limit.
   uint64 max_activity_api_events = 2 * 1024 * 1024;
 };

 // Abstract sink for the events produced while tracing. Concrete collectors
 // implement the three pure virtual hooks; the options given at construction
 // are copied into options_ for subclasses to consult.
 class CuptiTraceCollector {
  public:
   explicit CuptiTraceCollector(const CuptiTracerCollectorOptions& options)
       : options_(options) {}
   virtual ~CuptiTraceCollector() = default;

   // Receives one trace event; the event is passed as an rvalue so the
   // implementation may move from it.
   virtual void AddEvent(CuptiTracerEvent&& event) = 0;
   // Receives a notification that `num_events` events were dropped, together
   // with a textual `reason`.
   virtual void OnEventsDropped(const std::string& reason,
                                uint32 num_events) = 0;
   // NOTE(review): presumably called once collection is over so that buffered
   // events can be emitted; confirm against the callers.
   virtual void Flush() = 0;

  protected:
   CuptiTracerCollectorOptions options_;

  private:
   TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollector);
 };

 // Associates (device_id, correlation_id) pairs with annotation strings.
 // One PerDeviceAnnotationMap is allocated per GPU at construction.
 class AnnotationMap {
  public:
   // `max_size` is stored in max_size_; presumably it caps the number of
   // annotations kept (see CuptiTracerOptions::max_annotation_strings), to be
   // confirmed in the implementation. `num_gpus` fixes the number of
   // per-device maps.
   explicit AnnotationMap(uint64 max_size, uint32 num_gpus)
       : max_size_(max_size), per_device_map_(num_gpus) {}
   // Records `annotation` for the given device and correlation id.
   void Add(uint32 device_id, uint32 correlation_id,
            const std::string& annotation);
   // Returns the annotation recorded for the given device and correlation id.
   // NOTE(review): the result for an unknown id is not visible here;
   // presumably an empty view.
   absl::string_view LookUp(uint32 device_id, uint32 correlation_id);

  private:
   struct PerDeviceAnnotationMap {
     // The population/consumption of annotations might happen from multiple
     // callback/activity api related threads.
     absl::Mutex mutex;
     // Annotations tend to be repetitive, so use a hash_set to store the
     // strings and use a reference to the stored string in the map.
     absl::node_hash_set<std::string> annotations;
     absl::flat_hash_map<uint32, absl::string_view> correlation_map;
   };
   const uint64 max_size_;
   absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;

   TF_DISALLOW_COPY_AND_ASSIGN(AnnotationMap);
 };

 // Interface for objects that react to CUDA driver API callbacks. CuptiTracer
 // owns one instance (cupti_driver_api_hook_).
 class CuptiDriverApiHook {
  public:
   virtual ~CuptiDriverApiHook() = default;

   // Enter/exit notifications for a driver API callback identified by
   // (domain, cbid) on device `device_id`. Presumably invoked at the start and
   // end of the traced driver call respectively; confirm against the caller.
   virtual Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
                                   CUpti_CallbackId cbid,
                                   const CUpti_CallbackData* callback_info) = 0;
   virtual Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
                                  CUpti_CallbackId cbid,
                                  const CUpti_CallbackData* callback_info) = 0;
   virtual Status Flush() = 0;

  protected:
   // Shared helper for implementations. Judging by its name and parameters it
   // reports a driver API call spanning [start_tsc, end_tsc] to `collector`;
   // see the definition for the exact event contents.
   static Status AddDriverApiCallbackEvent(
       CuptiTraceCollector* collector, CuptiInterface* cupti_interface,
       int device_id, uint64 start_tsc, uint64 end_tsc,
       CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
       const CUpti_CallbackData* callback_info);
 };

 // Enables the CUPTI callback/activity APIs and forwards the collected trace
 // events to a CuptiTraceCollector. There should be only one CuptiTracer per
 // process; obtain it through GetCuptiTracerSingleton().
 class CuptiTracer {
  public:
   // Returns a pointer to singleton CuptiTracer.
   static CuptiTracer* GetCuptiTracerSingleton();

   // Only one profile session can be live at a time.
   bool IsAvailable() const;

   // Starts / stops a tracing session. `cupti_interface` and `collector` are
   // raw, non-owning pointers; presumably they must stay valid until Disable()
   // returns (confirm against the implementation).
   void Enable(const CuptiTracerOptions& option, CuptiInterface* cupti_interface,
               CuptiTraceCollector* collector);
   void Disable();

   // Entry point for a CUPTI API callback identified by (domain, cbid).
   Status HandleCallback(CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
                         const CUpti_CallbackData* callback_info);

   // This function is public because it is called from a registered callback.
   Status ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
                                uint8_t* buffer, size_t size);

   static uint64 GetTimestamp();
   static int NumGpus();

  private:
   // Private: instances are only handed out by GetCuptiTracerSingleton().
   CuptiTracer() : num_gpus_(NumGpus()) {}

   Status EnableApiTracing();
   Status EnableActivityTracing();
   Status DisableApiTracing();
   Status DisableActivityTracing();
   Status Finalize();
   void ConfigureActivityUnifiedMemoryCounter(bool enable);

   int num_gpus_;
   absl::optional<CuptiTracerOptions> option_;
   CuptiInterface* cupti_interface_ = nullptr;
   CuptiTraceCollector* collector_ = nullptr;
   absl::optional<AnnotationMap> annotation_map_;

   bool api_tracing_enabled_ = false;
   // Cupti handle for driver or runtime API callbacks. Cupti permits a single
   // subscriber to be active at any time and can be used to trace Cuda runtime
   // as well as driver calls for all contexts and devices.
   // Only meaningful while api_tracing_enabled_ is true; initialised to null so
   // that it never holds an indeterminate value.
   CUpti_SubscriberHandle subscriber_ = nullptr;

   bool activity_tracing_enabled_ = false;

   std::unique_ptr<CuptiDriverApiHook> cupti_driver_api_hook_;

   TF_DISALLOW_COPY_AND_ASSIGN(CuptiTracer);
 };

 }  // namespace profiler
 }  // namespace tensorflow

 #endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_TRACER_H_
	/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/

	#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_TRACER_H_
	#define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_TRACER_H_

	#include "absl/container/fixed_array.h"
	#include "absl/container/flat_hash_map.h"
	#include "absl/container/node_hash_set.h"
	#include "absl/types/optional.h"
	#include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h"
	#include "tensorflow/core/lib/core/errors.h"
	#include "tensorflow/core/lib/core/status.h"
	#include "tensorflow/core/platform/macros.h"
	#include "tensorflow/core/platform/types.h"
	#include "tensorflow/core/profiler/internal/gpu/cupti_interface.h"

	namespace tensorflow {
	namespace profiler {

	// Details recorded for a memcpy trace event. Stored in
	// CuptiTracerEvent::memcpy_info, which is meaningful when the event type is
	// one of the Memcpy* values.
	struct MemcpyDetails {
	// The amount of data copied for memcpy events.
	size_t num_bytes;
	// The destination device for peer-2-peer communication (memcpy). The source
	// device is implicit: it is the current device.
	uint32 destination;
	// Whether or not the memcpy is asynchronous.
	bool async;
	// This contains CUpti_ActivityMemcpyKind for activity event (on device).
	// For events from other CuptiTracerEventSource, it is always 0.
	int8 kind;
	};

	// Details recorded for a memory allocation trace event. Stored in
	// CuptiTracerEvent::memalloc_info, which is meaningful when the event type is
	// MemoryAlloc.
	struct MemAllocDetails {
	// The amount of data requested for cudaMalloc events.
	uint64 num_bytes;
	};

	// Launch configuration recorded for a kernel trace event. Stored in
	// CuptiTracerEvent::kernel_info, which is meaningful when the event type is
	// Kernel.
	struct KernelDetails {
	// The number of registers used by each thread of this kernel.
	uint64 registers_per_thread;
	// The amount of static shared memory space used by a thread block.
	uint64 static_shared_memory_usage;
	// The amount of dynamic shared memory space used by a thread block.
	uint64 dynamic_shared_memory_usage;
	// X-dimension of a thread block.
	uint64 block_x;
	// Y-dimension of a thread block.
	uint64 block_y;
	// Z-dimension of a thread block.
	uint64 block_z;
	// X-dimension of a grid.
	uint64 grid_x;
	// Y-dimension of a grid.
	uint64 grid_y;
	// Z-dimension of a grid.
	uint64 grid_z;
	};

	// The kind of activity a CuptiTracerEvent describes. It also selects which
	// member of the event's details union is meaningful: Memcpy* -> memcpy_info,
	// MemoryAlloc -> memalloc_info, Kernel -> kernel_info.
	// The numeric values are spelled out explicitly; keep them stable when adding
	// new enumerators.
	enum class CuptiTracerEventType {
	Unsupported = 0,
	Kernel = 1,
	MemcpyH2D = 2,
	MemcpyD2H = 3,
	MemcpyD2D = 4,
	MemcpyP2P = 5,
	MemcpyOther = 6,
	MemoryAlloc = 7,
	Overhead = 8,
	UnifiedMemory = 9,
	Generic = 100,
	};

	// Returns a C string for `type`. Presumably this is a human-readable name of
	// the enumerator; the definition is not in this header, so confirm there
	// (including what is returned for values outside the enum).
	const char* GetTraceEventTypeName(const CuptiTracerEventType& type);

	// Identifies which tracing mechanism produced a CuptiTracerEvent.
	enum class CuptiTracerEventSource {
	DriverCallback = 0,
	Activity = 1,
	// Maybe consider adding runtime callback and metric api in the future.
	};

	// A single trace record, handed to CuptiTraceCollector::AddEvent().
	//
	// Every non-union field carries a default, so a default-constructed event
	// never holds indeterminate values. Only the anonymous details union is left
	// unset: read the member that matches `type`, and only after the producer has
	// filled it in.
	struct CuptiTracerEvent {
	  // Sentinels meaning "not set" for the corresponding id fields below.
	  static constexpr uint32 kInvalidThreadId =
	      std::numeric_limits<uint32_t>::max();
	  static constexpr uint32 kInvalidCorrelationId =
	      std::numeric_limits<uint32_t>::max();
	  static constexpr uint64 kInvalidContextId =
	      std::numeric_limits<uint64_t>::max();
	  static constexpr uint64 kInvalidStreamId =
	      std::numeric_limits<uint64_t>::max();

	  CuptiTracerEventType type = CuptiTracerEventType::Unsupported;
	  // There is no "unknown" source; DriverCallback is simply the zero value.
	  // Producers are expected to set this explicitly.
	  CuptiTracerEventSource source = CuptiTracerEventSource::DriverCallback;
	  // Although CUpti_CallbackData::functionName is persistent,
	  // CUpti_ActivityKernel4::name is not, therefore we need a copy of it.
	  std::string name;
	  // This points to strings in AnnotationMap, which should outlive the point
	  // where serialization happens.
	  absl::string_view annotation;
	  uint64 start_time_ns = 0;
	  uint64 end_time_ns = 0;
	  uint32 device_id = 0;
	  uint32 correlation_id = kInvalidCorrelationId;
	  uint32 thread_id = kInvalidThreadId;
	  // The sentinels are unsigned while these two fields are signed. The cast
	  // makes the pre-existing wrap-around (to -1) explicit instead of implicit.
	  // Comparing a field against its sentinel still succeeds, because the signed
	  // value is converted back to unsigned for the comparison.
	  int64 context_id = static_cast<int64>(kInvalidContextId);
	  int64 stream_id = static_cast<int64>(kInvalidStreamId);
	  union {
	    MemcpyDetails memcpy_info;      // If type == Memcpy*
	    MemAllocDetails memalloc_info;  // If type == MemoryAlloc
	    KernelDetails kernel_info;      // If type == Kernel
	  };
	};

	// Options passed to CuptiTracer::Enable() to select what gets traced.
	struct CuptiTracerOptions {
	// Whether to use the CUPTI Activity API. Overridden by
	// enable_event_based_activity (see below).
	bool enable_activity_api = true;

	// Use cuda events to enclose the kernel/memcpy to measure device activity.
	// enable_event_based_activity, if true, will override the enable_activity_api
	// setting.
	bool enable_event_based_activity = false;

	// NOTE(review): not documented in this header. Presumably controls whether
	// events from the callback API are required/collected; confirm against the
	// implementation.
	bool required_callback_api_events = true;
	// Maximum number of annotation strings that we can accommodate.
	uint64 max_annotation_strings = 1024 * 1024;
	// The callback ids that will be enabled and monitored. If empty, all
	// callback ids are enabled using the Callback API.
	// We only care about the CUPTI_CB_DOMAIN_DRIVER_API domain for now. It is
	// kind of redundant to have both CUPTI_CB_DOMAIN_DRIVER_API and
	// CUPTI_CB_DOMAIN_RUNTIME_API.
	std::vector<CUpti_driver_api_trace_cbid_enum> cbids_selected;
	// Activity kinds to be collected using Activity API. If empty, the Activity
	// API is disabled.
	std::vector<CUpti_ActivityKind> activities_selected;
	// Whether to call cuptiFinalize.
	bool cupti_finalize = false;
	};

	// Limits handed to a CuptiTraceCollector at construction.
	// Both fields are unsigned, so the documented "-1" is stored as the maximum
	// uint64 value when assigned.
	struct CuptiTracerCollectorOptions {
	// Maximum number of events to collect from callback API; if -1, no limit.
	// If 0, the callback API is enabled to build a correlation map, but no
	// events are collected.
	uint64 max_callback_api_events = 2 * 1024 * 1024;
	// Maximum number of events to collect from activity API; if -1, no limit.
	uint64 max_activity_api_events = 2 * 1024 * 1024;
	};

	// Abstract sink for the events produced while tracing. Concrete collectors
	// implement the three pure virtual hooks; the options given at construction
	// are copied into options_ for subclasses to consult.
	class CuptiTraceCollector {
	 public:
	  explicit CuptiTraceCollector(const CuptiTracerCollectorOptions& options)
	      : options_(options) {}
	  virtual ~CuptiTraceCollector() = default;

	  // Receives one trace event; the event is passed as an rvalue so the
	  // implementation may move from it.
	  virtual void AddEvent(CuptiTracerEvent&& event) = 0;
	  // Receives a notification that `num_events` events were dropped, together
	  // with a textual `reason`.
	  virtual void OnEventsDropped(const std::string& reason,
	                               uint32 num_events) = 0;
	  // NOTE(review): presumably called once collection is over so that buffered
	  // events can be emitted; confirm against the callers.
	  virtual void Flush() = 0;

	 protected:
	  CuptiTracerCollectorOptions options_;

	 private:
	  TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollector);
	};

	// Associates (device_id, correlation_id) pairs with annotation strings.
	// One PerDeviceAnnotationMap is allocated per GPU at construction.
	class AnnotationMap {
	public:
	// `max_size` is stored in max_size_; presumably it caps the number of
	// annotations kept (see CuptiTracerOptions::max_annotation_strings), to be
	// confirmed in the implementation. `num_gpus` fixes the number of
	// per-device maps.
	explicit AnnotationMap(uint64 max_size, uint32 num_gpus)
	: max_size_(max_size), per_device_map_(num_gpus) {}
	// Records `annotation` for the given device and correlation id.
	void Add(uint32 device_id, uint32 correlation_id,
	const std::string& annotation);
	// Returns the annotation recorded for the given device and correlation id.
	// NOTE(review): the result for an unknown id is not visible here;
	// presumably an empty view.
	absl::string_view LookUp(uint32 device_id, uint32 correlation_id);

	private:
	struct PerDeviceAnnotationMap {
	// The population/consumption of annotations might happen from multiple
	// callback/activity api related threads.
	absl::Mutex mutex;
	// Annotations tend to be repetitive, so use a hash_set to store the
	// strings and use a reference to the stored string in the map.
	absl::node_hash_set<std::string> annotations;
	absl::flat_hash_map<uint32, absl::string_view> correlation_map;
	};
	const uint64 max_size_;
	absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;

	TF_DISALLOW_COPY_AND_ASSIGN(AnnotationMap);
	};

	// Interface for objects that react to CUDA driver API callbacks. CuptiTracer
	// owns one instance (cupti_driver_api_hook_).
	class CuptiDriverApiHook {
	 public:
	  virtual ~CuptiDriverApiHook() = default;

	  // Enter/exit notifications for a driver API callback identified by
	  // (domain, cbid) on device `device_id`. Presumably invoked at the start and
	  // end of the traced driver call respectively; confirm against the caller.
	  virtual Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
	                                  CUpti_CallbackId cbid,
	                                  const CUpti_CallbackData* callback_info) = 0;
	  virtual Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
	                                 CUpti_CallbackId cbid,
	                                 const CUpti_CallbackData* callback_info) = 0;
	  virtual Status Flush() = 0;

	 protected:
	  // Shared helper for implementations. Judging by its name and parameters it
	  // reports a driver API call spanning [start_tsc, end_tsc] to `collector`;
	  // see the definition for the exact event contents.
	  static Status AddDriverApiCallbackEvent(
	      CuptiTraceCollector* collector, CuptiInterface* cupti_interface,
	      int device_id, uint64 start_tsc, uint64 end_tsc,
	      CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
	      const CUpti_CallbackData* callback_info);
	};

	// Enables the CUPTI callback/activity APIs and forwards the collected trace
	// events to a CuptiTraceCollector. There should be only one CuptiTracer per
	// process; obtain it through GetCuptiTracerSingleton().
	class CuptiTracer {
	 public:
	  // Returns a pointer to singleton CuptiTracer.
	  static CuptiTracer* GetCuptiTracerSingleton();

	  // Only one profile session can be live at a time.
	  bool IsAvailable() const;

	  // Starts / stops a tracing session. `cupti_interface` and `collector` are
	  // raw, non-owning pointers; presumably they must stay valid until Disable()
	  // returns (confirm against the implementation).
	  void Enable(const CuptiTracerOptions& option, CuptiInterface* cupti_interface,
	              CuptiTraceCollector* collector);
	  void Disable();

	  // Entry point for a CUPTI API callback identified by (domain, cbid).
	  Status HandleCallback(CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
	                        const CUpti_CallbackData* callback_info);

	  // This function is public because it is called from a registered callback.
	  Status ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
	                               uint8_t* buffer, size_t size);

	  static uint64 GetTimestamp();
	  static int NumGpus();

	 private:
	  // Private: instances are only handed out by GetCuptiTracerSingleton().
	  CuptiTracer() : num_gpus_(NumGpus()) {}

	  Status EnableApiTracing();
	  Status EnableActivityTracing();
	  Status DisableApiTracing();
	  Status DisableActivityTracing();
	  Status Finalize();
	  void ConfigureActivityUnifiedMemoryCounter(bool enable);

	  int num_gpus_;
	  absl::optional<CuptiTracerOptions> option_;
	  CuptiInterface* cupti_interface_ = nullptr;
	  CuptiTraceCollector* collector_ = nullptr;
	  absl::optional<AnnotationMap> annotation_map_;

	  bool api_tracing_enabled_ = false;
	  // Cupti handle for driver or runtime API callbacks. Cupti permits a single
	  // subscriber to be active at any time and can be used to trace Cuda runtime
	  // as well as driver calls for all contexts and devices.
	  // Only meaningful while api_tracing_enabled_ is true; initialised to null so
	  // that it never holds an indeterminate value.
	  CUpti_SubscriberHandle subscriber_ = nullptr;

	  bool activity_tracing_enabled_ = false;

	  std::unique_ptr<CuptiDriverApiHook> cupti_driver_api_hook_;

	  TF_DISALLOW_COPY_AND_ASSIGN(CuptiTracer);
	};

	} // namespace profiler
	} // namespace tensorflow

	#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_TRACER_H_