| #pragma once |
| |
| #include <c10/macros/Export.h> |
| |
| #include <fusion.h> |
| #include <ir_base_nodes.h> |
| #include <ir_builder.h> |
| #include <lower_sync_information.h> |
| #include <lower_warp_reduce.h> |
| #include <parallel_dimension_map.h> |
| #include <utils.h> |
| #include <vectorization_info.h> |
| |
| #include <memory> |
| #include <unordered_map> |
| #include <utility> |
| #include <vector> |
| |
| namespace torch { |
| namespace jit { |
| namespace fuser { |
| namespace cuda { |
| namespace kir { |
| |
| //! Summary of interesting facts about the kernel |
| // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) |
| struct KernelSummary { |
| //! Count of WAR (write-after-read) hazard barriers |
| int war_hazard_syncs_count = 0; |
| |
| //! List of global buffers |
| std::vector<const kir::Allocate*> global_allocations; |
| |
| //! List of dynamic shared memory buffers |
| std::vector<const kir::Allocate*> dynamic_smem_allocations; |
| |
| //! List of static shared memory buffers |
| std::vector<const kir::Allocate*> static_smem_allocations; |
| |
| //! Largest RNG offset used (-1 indicates no random number generation is needed) |
| int max_rng_offsets = -1; |
| |
| //! Do we have any block reductions? |
| bool has_block_reductions = false; |
| |
| //! Do we have any grid reductions? |
| bool has_grid_reductions = false; |
| |
| //! Do we have any grid reductions in a loop, or grid reductions dependent |
| //! on other grid reductions? |
| bool has_cooperative_grid_reduction = false; |
| |
| //! Do we have any block broadcasts? |
| bool has_block_broadcasts = false; |
| |
| //! Do we have any grid broadcasts? |
| bool has_grid_broadcasts = false; |
| |
| //! Do we have any welford op? |
| bool has_welford = false; |
| |
| //! Do we have any block welford op? |
| bool has_block_welford = false; |
| |
| //! Do we have any grid welford op? |
| bool has_grid_welford = false; |
| |
| //! Largest shared memory buffer base type |
| DataType largest_smem_data_type = DataType::Null; |
| |
| //! Do we have allocations of dynamic local memory? |
| bool has_dynamic_local_memory_allocations = false; |
| |
| //! List of dynamic local memory buffers. |
| //! Only used for debugging. |
| std::vector<const kir::Allocate*> dynamic_lmem_allocations; |
| |
| //! ceilDiv extents that must be divisible |
| std::vector<std::pair<const Val*, const Val*>> splits_to_validate; |
| |
| //! Effective ParallelTypes of broadcast ops |
| std::unordered_map<const BroadcastOp*, ParallelTypeBitmap> |
| broadcast_parallel_types; |
| |
| //! Track which tensor views are inputs or outputs of a vectorized operation |
| //! and their maximum vectorized access size |
| std::unordered_map<TensorView*, int> vectorized_accesses; |
| |
| // Sync map is needed to figure out if global memory buffers need to be marked |
| // as volatile because they're used for communication. |
| SyncMap sync_map; |
| |
| // Parallel dimension map needed to set the correct properties of grid |
| // buffers (e.g., whether a dimension is inactive) |
| ParallelDimensionMap parallel_dimension_map_; |
| |
| //! Track information on vectorized set operations for runtime validation |
| std::vector<VectorizedSetInfo> vectorized_set_info; |
| }; |
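| |
| //! A minimal, hypothetical sketch of how a consumer might branch on the |
| //! summary after lowering (names and call sites are illustrative): |
| //! |
| //!   const auto& summary = kernel->summary(); |
| //!   if (summary.has_cooperative_grid_reduction) { |
| //!     // the launch path must use a cooperative kernel launch |
| //!   } |
| //!   for (auto alloc : summary.dynamic_smem_allocations) { |
| //!     // accumulate the dynamic shared memory requirement per buffer |
| //!   } |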
| |
| class TORCH_CUDA_CU_API KernelPerformanceProfile { |
| public: |
| //! Register an expression to profile |
| void registerExpr(const Expr* expr); |
| |
| //! Query if an expression is profiled |
| bool isProfiled(const Expr* expr) const; |
| |
| //! Get the number of profiled expressions |
| int getNumberOfProfileEntries() const { |
| return num_profile_entries_; |
| } |
| |
| //! Set the backing buffer of the profile. |
| void setBuffer(TensorView* buffer) { |
| buffer_ = buffer; |
| } |
| |
| //! Get the backing buffer |
| TensorView* getBuffer() const { |
| return buffer_; |
| } |
| |
| //! Get the indices in the backing buffer of the profile entry for an expression |
| std::array<int, 2> getIndicesInProfileBuffer(const Expr* expr) const; |
| |
| std::string toString(const at::Tensor& buffer) const; |
| |
| private: |
| //! Get the new profile index |
| int getNewIndex(); |
| |
| //! Get the profile index |
| c10::optional<int> getIndex(const Expr* expr) const; |
| |
| private: |
| int num_profile_entries_ = 0; |
| |
| //! Backing buffer: an Nx2 integer tensor, where N is the number of |
| //! profiled regions. Each region has two integer values, one representing |
| //! the cycles spent and another the count. |
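| //! |
| //! A hypothetical layout with two profiled expressions (illustrative |
| //! values only): |
| //! |
| //!   [[cycles_0, count_0], |
| //!    [cycles_1, count_1]] |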
| TensorView* buffer_ = nullptr; |
| |
| //! Map profiled expressions to profile entry offsets |
| std::unordered_map<const Expr*, int> expr_entry_map_; |
| |
| // TODO: Allow profiling of ForLoops |
| //! Map profiled ForLoop to profile entry offsets |
| // std::unordered_map<const kir::ForLoop*, int> loop_entry_map_; |
| }; |
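| |
| //! A hedged usage sketch of the profiling API (assumed call order; names |
| //! such as reduction_expr are illustrative): |
| //! |
| //!   KernelPerformanceProfile profile; |
| //!   profile.registerExpr(reduction_expr); |
| //!   if (profile.isProfiled(reduction_expr)) { |
| //!     // offsets of this expression's [cycles, count] pair in the buffer |
| //!     auto indices = profile.getIndicesInProfileBuffer(reduction_expr); |
| //!   } |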
| |
| class KernelInternalProxy; |
| |
| //! Container for a lowered Kernel IR |
| //! |
| // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) |
| class TORCH_CUDA_CU_API Kernel final : public Fusion { |
| friend KernelInternalProxy; |
| |
| public: |
| // Kernel starts by grabbing all the nodes from the provided fusion. |
| // Kernel is not SSA: if a definition is not set, we should update it, but |
| // we should not remove a previous definition if one is set. This is |
| // primarily because, when we generate something like an initialization |
| // statement for a reduction TV, we may want to continue doing fusion-like |
| // analysis on the original expression. |
| // TODO: Assert index type is int or int32 |
| Kernel(Fusion* fusion, DataType index_type = DataType::Int) |
| : Fusion(*fusion), index_type_(index_type) {} |
| |
| Kernel() = delete; |
| |
| // No move or copy semantics |
| Kernel(const Kernel&) = delete; |
| Kernel& operator=(const Kernel&) = delete; |
| |
| //! Finalize a kernel definition |
| //! |
| //! At this point we have a complete kernel definition and we can |
| //! run analysis passes to build a KernelSummary. |
| void finalize(std::vector<Expr*> top_level_exprs); |
| |
| const std::vector<Expr*>& topLevelExprs() const { |
| return top_level_exprs_; |
| } |
| |
| const KernelSummary& summary() const { |
| return summary_; |
| } |
| |
| DataType indexType() const { |
| return index_type_; |
| } |
| |
| //! Checks if parallel type is padded |
| bool isParallelTypePadded(ParallelType ptype) const { |
| return ptype == ParallelType::TIDx && |
| warp_padded_parallel_info_.is_tidx_padded; |
| } |
| |
| const WarpPaddedParallelInfo& getWarpPaddedParallelInfo() const { |
| return warp_padded_parallel_info_; |
| } |
| |
| const KernelPerformanceProfile& profile() const { |
| return profile_; |
| } |
| |
| //! Debug dump of the Kernel IR |
| void print() const; |
| |
| protected: |
| //! Register the Val with this fusion |
| void registerVal(Val* val) override; |
| |
| //! Register expr with this fusion. |
| //! When we register an expression, we want to update the dependency |
| //! tracking of Vals. We add expr to our general expr_set_. |
| void registerExpr(Expr* expr) override; |
| |
| private: |
| // Analyzes the kernel IR and caches a summary of interesting data |
| void analyze(); |
| |
| // Top level statements |
| std::vector<Expr*> top_level_exprs_; |
| |
| // Summary of interesting kernel data |
| KernelSummary summary_; |
| |
| // Whether this kernel is being compiled with int32 or int64 indexing. This |
| // information is required to resolve DataType::Index |
| DataType index_type_ = DataType::Int; |
| |
| WarpPaddedParallelInfo warp_padded_parallel_info_; |
| |
| KernelPerformanceProfile profile_; |
| }; |
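| |
| //! A minimal sketch of the expected lifecycle, assuming lowering has |
| //! already produced the top-level Kernel IR expressions (fusion and |
| //! top_level_exprs are illustrative): |
| //! |
| //!   auto kernel = std::make_unique<kir::Kernel>(fusion, DataType::Int); |
| //!   // ... lowering passes build the Kernel IR expressions ... |
| //!   kernel->finalize(std::move(top_level_exprs)); // runs analysis passes |
| //!   const auto& summary = kernel->summary();      // cached KernelSummary |
| //!   kernel->print();                              // debug dump of the IR |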
| |
| //! A special debugging proxy for Kernel. |
| //! |
| //! Should not be used for anything other than testing and debugging. |
| class TORCH_CUDA_CU_API KernelInternalProxy { |
| public: |
| KernelInternalProxy(Kernel* kernel) : kernel_(kernel) {} |
| |
| std::vector<Expr*>& topLevelExprs(); |
| |
| private: |
| Kernel* kernel_ = nullptr; |
| }; |
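| |
| //! Usage sketch (testing/debugging only; kernel and some_expr are |
| //! illustrative): the proxy grants mutable access to the kernel's |
| //! top-level expressions, which the public Kernel interface keeps const. |
| //! |
| //!   kir::KernelInternalProxy proxy(kernel); |
| //!   proxy.topLevelExprs().push_back(some_expr); |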
| |
| } // namespace kir |
| } // namespace cuda |
| } // namespace fuser |
| } // namespace jit |
| } // namespace torch |