/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_
#include "llvm/IR/Value.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
namespace xla {
namespace llvm_ir {
// About 0-2-1 transpose:
//
// If a shape can be viewed as three logical components 0-1-2 in the order of
// major to minor, a 0-2-1-transpose changes the order of such logical
// components to 0-2-1. We call the shape being transposed the input shape and
// the transposed shape the output shape. The logical views of the input/output
// shapes for the transpose are called the 0-1-2/0-2-1 shapes, or the
// normalized shapes. The original input/output shapes are called the
// unnormalized shapes.
//
// If `b` is a 0-2-1 transpose of `a` in 0-1-2, returns the dimensions for the
// normalized shape of `b`, i.e. the 0-2-1 shape; otherwise returns
// absl::nullopt.
absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
const Shape& b);
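//
// A hypothetical call-site sketch (the shapes and variable names here are
// assumptions for illustration, not taken from a real emitter): if `a` is a
// row-major f32[8, 32, 768] and `b` holds the same elements with the two minor
// logical components swapped, the returned vector would describe the
// normalized 0-2-1 shape, e.g. {8, 768, 32}.
//
//   if (absl::optional<std::vector<int64>> dims_021 =
//           FindTranspose021(input_shape, output_shape)) {
//     // *dims_021 holds the normalized {Z, Y, X} dimensions that can be used
//     // to build a KernelMappingScheme (see below).
//   }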
// A tile is a spatial subdivision of a tensor. We group tensor elements into
// tiles so that we can launch kernels to process the tensor elements in blocks
// of tiles.
//
// A kernel mapping scheme describes a method to partition the tensors accessed
// by an unnested HLO instruction into tiles and blocks of tiles, and the
// associated information to use hardware threads to process the tensor elements
// in blocks of tiles.
//
// Currently, there are two main use cases for a tiling scheme. First, we
// implement kernels with 0-2-1 memory transpose using shared memory to improve
// the memory access pattern. Second, we implement reduction to dimensions that
// are contiguous in the layout, with or without memory transpose, to achieve a
// better memory access pattern as well as to reduce the number of executed
// expensive instructions, such as thread synchronization related instructions
// and atomic operations. For both use cases, we can apply a normalization to
// the original tensors that collapses contiguous dimensions serving the same
// purpose, producing normalized three dimensional tensors. For this reason,
// the tiling scheme class only needs to handle normalized three dimensional
// tensors and two dimensional tiles.
//
// The current implementation of the class is somewhat NVIDIA-GPU oriented;
// this can be improved when the need arises. The idea of a 0-2-1
// transpose using shared memory can be found in the following CUDA algorithm in
// TensorFlow: https://goo.gl/MStRV6.
//
// We use a thread block to process a tile because we want to use the HW thread
// block synchronization primitives to synchronize the processing of all the
// elements in the same tile. A thread block can be viewed as a two dimensional
// array of threads, described by the number of threads for the Y and X
// dimensions. A thread block (num_threads_y, num_threads_x) processes a tile of
// (tile_size_y, tile_size_x) as follows: each thread in the thread block
// processes one element in the tile so that all the threads in the thread block
// together process a subdivision of the tile that has the same dimensions as the
// thread block array. Then the thread block moves on to process the next
// subdivision of the tile until the whole tile is processed. Therefore, each
// thread in the thread block processes
// tile_size_x/num_threads_x * tile_size_y/num_threads_y elements in a tile.
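//
// As a concrete illustration (sizes chosen arbitrarily), a thread block of
// (num_threads_y=8, num_threads_x=32) processing a (tile_size_y=32,
// tile_size_x=32) tile covers an 8x32 subdivision per step and needs
// 32/32 * 32/8 = 4 steps, so each thread processes 4 elements of the tile.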
//
// There are situations where we want a thread block to process multiple
// tiles. We can't group those tiles into a bigger tile because we limit a tile
// to a two dimensional spatial subdivision of a tensor. For example, when we
// use tiling to implement reduction with transpose, we want the partial sums
// produced by each thread to accumulate values for more elements before using
// shfl_down and atomic_add instructions for further reduction, to amortize the
// cost of such expensive instructions. The concept of a tile block is
// introduced for this purpose. A tile block is a three dimensional array of
// tiles, some of whose dimensions may be degenerate, i.e. contain only one
// tile.
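//
// As a sketch (illustrative sizes only), a tile block with block_sizes_ of
// {1, 1, 4} is degenerate in the Z and Y dimensions and covers 4 consecutive
// tiles in the X dimension, so one thread block accumulates partial results
// across 4 tiles before performing the cross-thread reduction.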
class KernelMappingScheme {
public:
enum { DimZ = 0, DimY, DimX, DimTot };
public:
KernelMappingScheme() {}
// dims_in_elems: the normalized tensor dimensions.
// req_block_sizes: the requested block size, in number of tiles, for each
// dimension. The actual block size in a dimension is set to
// min(req_block_size, the number of tiles in that dimension).
KernelMappingScheme(absl::Span<const int64> dims_in_elems, int64 tile_size_y,
int64 tile_size_x,
absl::Span<const int64> req_block_sizes,
int64 num_threads_y, int64 num_threads_x,
llvm::IRBuilder<>* b);
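// A minimal usage sketch; the concrete values below are assumptions for
// illustration only, and `builder` stands for the caller's llvm::IRBuilder<>:
//
//   KernelMappingScheme mapping_scheme(/*dims_in_elems=*/{8, 1024, 1024},
//                                      /*tile_size_y=*/32,
//                                      /*tile_size_x=*/32,
//                                      /*req_block_sizes=*/{1, 1, 1},
//                                      /*num_threads_y=*/8,
//                                      /*num_threads_x=*/32, builder);
//
// Assuming a Z tile size of 1, GetDimensionsInTiles() would then be
// {8, 32, 32} and GetThreadsPerBlock() would be 8 * 32 = 256.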
absl::Span<const int64> GetDimensionsInElements() const {
return dims_in_elems_;
}
absl::Span<const int64> GetDimensionsInTiles() const {
return dims_in_tiles_;
}
absl::Span<const int64> GetDimensionsInBlocks() const {
return dims_in_blocks_;
}
int64 GetNumberOfTilesInTotal() const {
return absl::c_accumulate(dims_in_tiles_, 1LL, std::multiplies<int64>());
}
int64 GetNumberOfTilesInOneBlock() const {
return absl::c_accumulate(block_sizes_, 1, std::multiplies<int64>());
}
int64 GetNumberOfTilesInOneBlockForDimension(int d) const {
DCHECK(d >= DimZ && d <= DimX);
return block_sizes_[d];
}
int64 GetNumberOfBlocks() const {
return absl::c_accumulate(dims_in_blocks_, 1, std::multiplies<int64>());
}
int64 GetTileSizeForDimension(int d) const { return tile_sizes_.at(d); }
int64 GetTileSizeForDimensionX() const {
return GetTileSizeForDimension(DimX);
}
int64 GetTileSizeForDimensionY() const {
return GetTileSizeForDimension(DimY);
}
absl::Span<const int64> GetBlockSizes() const { return block_sizes_; }
int64 GetTileBlockSizeForDimension(int d) const {
return dims_in_blocks_.at(d);
}
int64 GetNumberOfThreadsForDimensionX() const { return num_threads_x_; }
int64 GetNumberOfThreadsForDimensionY() const { return num_threads_y_; }
int64 GetThreadsPerBlock() const {
return GetNumberOfThreadsForDimensionX() *
GetNumberOfThreadsForDimensionY();
}
bool DilatedX() const { return dilated_x_; }
void SetDilatedX(bool v) {
dilated_x_ = v;
if (!dilated_x_) {
// dilated_x_=false is for the purpose of vectorization, which requires
// GetTileSizeForDimension(DimX) to be a multiple of num_threads_x_.
CHECK_EQ(GetTileSizeForDimension(DimX) % num_threads_x_, 0);
}
}
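// Illustration with assumed sizes: with tile_size_x=32 and num_threads_x=8,
// each thread handles n=4 elements of a tile row. With dilated_x_=false,
// thread tx handles the contiguous elements 4*tx .. 4*tx+3; with
// dilated_x_=true, it handles the strided elements tx, tx+8, tx+16, tx+24.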
IrArray::Index EmitBlockIndex(llvm::Type* index_ty);
// Returns the index for the first tile in the block with the given block
// index.
IrArray::Index GetTileIndexForBlockOrigin(const IrArray::Index& block_index);
// Returns the index for the first element in the tile with the given tile
// index.
IrArray::Index GetElementIndexForTileOrigin(const IrArray::Index& tile_index);
std::tuple<llvm::Value*, llvm::Value*> EmitThreadYXCoordinate(
llvm::Type* index_ty);
IrArray::Index GetUnnormalizedIndex(
const IrArray::Index& normalized_shape_index,
const Shape& unnormalized_shape);
llvm::GlobalVariable* GetSharedMemoryBufferForElementType(
llvm::Type* elem_ty, absl::string_view buffer_name);
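// A hedged usage sketch (the element type and the buffer name are assumptions
// for illustration):
//
//   llvm::GlobalVariable* tile_buffer =
//       mapping_scheme.GetSharedMemoryBufferForElementType(
//           llvm_ir::PrimitiveTypeToIrType(F32, module), "tile_buffer");
//
// The returned global is intended to back one tile of elements in shared
// memory while a thread block transposes or reduces that tile.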
private:
llvm::IRBuilder<>* b_;
// The number of elements in each dimension.
std::array<int64, 3> dims_in_elems_;
// The number of elements for each dimension of a tile.
std::array<int64, 3> tile_sizes_;
// The number of tiles in each dimension. It is computed from dims_in_elems_
// and tile_sizes_.
std::array<int64, 3> dims_in_tiles_;
// The number of tiles for each dimension of a tile block.
std::array<int64, 3> block_sizes_;
// The number of tile blocks in each dimension. It is computed from
// dims_in_tiles_ and block_sizes_.
std::array<int64, 3> dims_in_blocks_;
// Number of threads used to process elements in the X direction of a tile.
int64 num_threads_x_;
// Number of threads used to process elements in the Y direction of a tile.
int64 num_threads_y_;
// When num_threads_x threads process a total of tile_size_x elements in the
// X dimension of a tile, each thread processes n = tile_size_x / num_threads_x
// elements. When dilated_x=false, the n elements processed by a thread are
// contiguous. On the other hand, when dilated_x=true, the n elements are
// dilated by a factor of num_threads_x.
bool dilated_x_;
};
} // namespace llvm_ir
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_