ruy/mat.h - platform/external/ruy - Git at Google

 /* Copyright 2019 Google LLC. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 // Internal types and helpers for matrices.
 // "Mat" is the name we use to refer to our internal matrix classes; it can be
 // thought of as a shorter version of "InternalMatrix"`
 //
 // Ruy has four internal matrix classes, besides the
 // Matrix<T> class that we expose to the user-facing API.
 //
 // TODO(silvasean): Put parts of this architecture description somewhere more
 // prominent.
 //
 // The 4 internal matrix classes are named Mat, EMat, PMat, PEMat, where:
 // - "E" indicates a type-erased class, storing a void* pointer and a runtime
 //   enum value to track the scalar type, as opposed to being templatized
 //   on a Scalar type and storing a Scalar* pointer.
 // - "P" indicates a packed matrix class, the output of the packing code and
 //   input of the kernel code. See comments in pack.h regarding packing.
 //
 // In other words:
 //
 //                Plain matrices   Packed matrices
 //             +----------------------------------
 // Templated   |  Mat, Matrix      PMat
 // Type-erased |  EMat             PEMat
 //
 // Note that Matrix<T> is *not* implemented in terms of the internal types. It
 // is an independent, simple, and user-facing type. Matrix<T> is functionally
 // equivalent to Mat, but we keep it separate to insulate internals from
 // interface and to be able to make different compromises in internals
 // vs interface: in internals we prefer Mat to be a C-style struct with
 // raw data member access and to be similar to the other PMat/EMat/PEMat
 // classes for consistency.
 //
 // The use of type-erasure might seem surprising for a library like Ruy with a
 // heavily-templated entry point, but it is motivated by the desire for most of
 // Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3
 // main parts:
 // - "entry-point" (ruy.h) - this is the highly templated ruy::Mul entry
 // point.
 // - "front-end" (frontend.*, validate.*, create_trmul_params.*,
 // prepare_packed_matrices.*) - the work to handle the entry-point call down to
 // the point where it can be handed off to the middle/back ends below. That
 // includes routines that select RunKernel and RunPack
 // implementations statically based on those template parameters.
 // - "back-end" (kernel_*.*, pack_*.*)- this consists of the implementations of
 // RunKernel and RunPack, often in assembly code, which are the building blocks
 // that Ruy calls to perform matrix multiplication.  These are templated so that
 // only the requested types/Path's are actually emitted by the compiler.
 // - "middle-end" (trmul.*) - this is the part of Ruy that orchestrates the
 // calls to the "back-end" optimized building blocks. This layer has to deal
 // with issues like cache locality and low-overhead multi-threading.
 //
 // There is a desire for the "middle-end" to be non-templated in order to
 // simplify the implementation and reduce code-size. We type-erase when going
 // from the "front-end" to the "middle-end", and un-type-erase going from the
 // "middle-end" to the "back-end". The un-type-erasure is possible because the
 // "front-end" is responsible for instantiating the needed "back-end" templates,
 // and thus the static type information is still present.
 //
 // Each layer of Ruy uses matrix types:
 // - "entry-point": Matrix<T>
 // - "front-end": Mat
 // - "middle-end": EMat, PEMat
 // - "back-end": Mat, PMat
 //
 // The use of separate types for packed matrices is not essential, but makes it
 // obvious at a glance whether a matrix is a packed matrix or not. We would
 // reconsider this decision if there was significant duplication between packed
 // and unpacked matrices, but that doesn't seem to be the case at the moment.
 //
 // Another goal is to keep the user-facing Matrix<T> as simple and
 // understandable as possible. Ideally, a user should be able to read the struct
 // definition for Matrix<T> and see a very simple definition with no internal
 // details like sums and kernel block layout.

 #ifndef RUY_RUY_INTERNAL_MATRIX_H_
 #define RUY_RUY_INTERNAL_MATRIX_H_

 #include <cstddef>
 #include <cstdint>
 #include <type_traits>
 #include <utility>

 #include "ruy/check_macros.h"
 #include "ruy/matrix.h"
 #include "ruy/size_util.h"

 namespace ruy {

 // Internal counterpart of Layout, used by Mat.
 struct MatLayout final {
   std::int32_t rows = 0;
   std::int32_t cols = 0;
   // Stride is the offset between two adjacent matrix elements
   // in the non-contiguous direction.
   std::int32_t stride = 0;
   Order order = Order::kColMajor;
 };

 inline MatLayout ToInternal(const Layout& src) {
   MatLayout ret;
   ret.rows = src.rows();
   ret.cols = src.cols();
   ret.stride = src.stride();
   ret.order = src.order();
   return ret;
 }

 // Internal counterpart of Matrix
 template <typename Scalar>
 struct Mat final {
   detail::ConstCheckingPtr<Scalar> data;
   MatLayout layout;
   Scalar zero_point = 0;
   CachePolicy cache_policy = CachePolicy::kNeverCache;
 };

 template <typename Scalar>
 inline Mat<Scalar> ToInternal(const Matrix<Scalar>& src) {
   Mat<Scalar> ret;
   ret.data.set(src.data());
   ret.layout = ToInternal(src.layout());
   ret.zero_point = src.zero_point();
   ret.cache_policy = src.cache_policy();
   return ret;
 }

 template <typename Scalar>
 inline Mat<Scalar> ToInternal(Matrix<Scalar>& src) {
   Mat<Scalar> ret;
   ret.data.set(src.data());
   ret.layout = ToInternal(src.layout());
   ret.zero_point = src.zero_point();
   ret.cache_policy = src.cache_policy();
   return ret;
 }

 // KernelLayout describes small-scale block structure in a packed matrix layout.
 // It's a runtime (as opposed to compile-time-constant) version of the
 // FixedKernelLayout struct used to declare kernel layouts.
 //
 // This is is sometimes known as "tiling" in other contexts.
 //
 // For example, consider a packed matrix in column-major format with a
 // column-major KernelLayout. The matrix logically has a shape of
 // `[cols, rows]`. However, the matrix is laid out as though it were a 4D array
 // of shape `[cols / kcols, rows / krows, kcols, krows]`.
 //
 // Note that in the case of kcols=1, krows=1, this degenerates to
 // `[cols, rows, 1, 1]` which is equivalent to having no small-scale block
 // structure.
 struct KernelLayout final {
   Order order = Order::kColMajor;
   std::uint8_t rows = 1;
   std::uint8_t cols = 1;
 };

 // A packed matrix has a small-scale block structure that is not present in in
 // the input matrices. This block structure is necessary for the kernels to
 // process data efficiently.
 //
 // This struct is very similar to MatLayout, but has the extra KernelLayout
 // field.
 struct PMatLayout final {
   std::int32_t rows = 0;
   std::int32_t cols = 0;
   // Stride is the offset between two adjacent matrix elements
   // in the non-contiguous direction.
   std::int32_t stride = 0;
   Order order = Order::kColMajor;
   // Small scale layout shuffling, potentially departing from
   // linear row-major or column-major storage. See KernelLayout.
   KernelLayout kernel;
 };

 inline bool operator==(const PMatLayout& a, const PMatLayout& b) {
   return a.cols == b.cols && a.rows == b.rows && a.stride == b.stride &&
          a.order == b.order && a.kernel.rows == b.kernel.rows &&
          a.kernel.cols == b.kernel.cols && a.kernel.order == b.kernel.order;
 }

 // Dynamic representation for a type.
 //
 // The most important field in this struct is the size, which Ruy uses to know
 // how much memory to allocate without having to be templated on a type.
 // Signed-ness and floating-point-ness are mainly present as debugging checks.
 //
 // Note: Ruy does not use this struct to to dynamically dispatch between
 // different typed implementations. As described in the comment at the top of
 // this file, Ruy's "front-end", which is templated, instantiates all the
 // necessary "back-end" routines with complete static knowledge of all the
 // types.
 struct Type final {
   template <typename T>
   static Type Create() {
     Type ret;
     ret.is_signed = std::is_signed<T>::value;
     ret.is_floating_point = std::is_floating_point<T>::value;
     ret.size = sizeof(T);
     return ret;
   }

   template <typename T>
   void AssertIs() const {
     RUY_DCHECK_EQ(is_signed, Create<T>().is_signed);
     RUY_DCHECK_EQ(is_floating_point, Create<T>().is_floating_point);
     RUY_DCHECK_EQ(size, Create<T>().size);
   }

   bool is_signed = false;
   bool is_floating_point = false;
   std::uint8_t size = 0;
 };

 inline bool operator==(const Type& type1, const Type& type2) {
   return type1.is_signed == type2.is_signed &&
          type1.is_floating_point == type2.is_floating_point &&
          type1.size == type2.size;
 }

 // Type-erased matrix.
 struct EMat final {
   Type data_type;
   void* data = nullptr;
   MatLayout layout;
   std::int32_t zero_point = 0;
   CachePolicy cache_policy = CachePolicy::kNeverCache;
 };

 // Type-erased packed matrix.
 struct PEMat final {
   Type data_type;
   void* data = nullptr;
   Type sums_type;
   void* sums = nullptr;
   PMatLayout layout;
   std::int32_t zero_point = 0;
 };

 // Convenient typed helper for packed matrices.
 template <typename Scalar>
 struct PMat final {
   // The row/column sums needed for quantized matrix multiplication when
   // the opposite operand of the multiplication uses a non-symmetric zero
   // point.
   // This member is only relevant for packed matrices.
   // Additionally, Ruy always uses 32-bit signed accumulators for quantized
   // matrix multiplication.
   // For floating point types, there is no quantization, so this pointer
   // will always be null. We still need code referencing it to compile
   // though, even if it is always branched around. Hence we use Scalar*
   // itself as the type in that case.
   using SumsType =
       typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
                                 std::int32_t>::type;

   Scalar* data = nullptr;
   SumsType* sums = nullptr;
   PMatLayout layout;
   std::int32_t zero_point = 0;
 };

 template <typename T>
 EMat EraseType(const Mat<T>& matrix) {
   EMat ret;
   ret.data_type = Type::Create<T>();
   ret.data = const_cast<void*>(static_cast<const void*>(matrix.data.get()));
   ret.layout = matrix.layout;
   ret.zero_point = matrix.zero_point;
   ret.cache_policy = matrix.cache_policy;
   return ret;
 }

 template <typename T>
 Mat<T> UneraseType(const EMat& matrix) {
   matrix.data_type.AssertIs<T>();
   Mat<T> ret;
   ret.data.set(static_cast<T*>(matrix.data));
   ret.layout = matrix.layout;
   ret.zero_point = matrix.zero_point;
   ret.cache_policy = matrix.cache_policy;
   return ret;
 }

 template <typename T>
 PMat<T> UneraseType(const PEMat& matrix) {
   using SumsType = typename PMat<T>::SumsType;
   matrix.data_type.AssertIs<T>();
   matrix.sums_type.AssertIs<SumsType>();
   PMat<T> ret;
   ret.data = static_cast<T*>(matrix.data);
   ret.sums = static_cast<SumsType*>(matrix.sums);
   ret.layout = matrix.layout;
   ret.zero_point = matrix.zero_point;
   return ret;
 }

 // Helpers for MatLayout / PMatLayout.

 inline bool IsUnstrided(const MatLayout& layout) {
   if (layout.order == Order::kColMajor) {
     return layout.stride == layout.rows;
   } else {
     return layout.stride == layout.cols;
   }
 }

 inline bool IsRowMajor(const MatLayout& layout) {
   return layout.order == Order::kRowMajor;
 }

 inline bool IsColMajor(const MatLayout& layout) {
   return layout.order == Order::kColMajor;
 }

 inline int FlatSize(const MatLayout& layout) {
   const int outerdim =
       layout.order == Order::kColMajor ? layout.cols : layout.rows;
   return layout.stride * outerdim;
 }

 inline bool IsUnstrided(const PMatLayout& layout) {
   if (layout.order == Order::kColMajor) {
     return layout.stride == layout.rows;
   } else {
     return layout.stride == layout.cols;
   }
 }

 inline bool IsRowMajor(const PMatLayout& layout) {
   return layout.order == Order::kRowMajor;
 }

 inline bool IsColMajor(const PMatLayout& layout) {
   return layout.order == Order::kColMajor;
 }

 inline int FlatSize(const PMatLayout& layout) {
   const int outerdim =
       layout.order == Order::kColMajor ? layout.cols : layout.rows;
   return layout.stride * outerdim;
 }

 // TODO(b/130417400) add a unit test
 inline int Offset(const MatLayout& layout, int row, int col) {
   // TODO(benoitjacob)  - should check this but this make the _slow tests take
   // 5x longer.  Find a mitigation like in Eigen with an 'internal' variant
   // bypassing the check?
   // RUY_DCHECK_GE(row, 0);
   // RUY_DCHECK_GE(col, 0);
   // RUY_DCHECK_LT(row, layout.rows);
   // RUY_DCHECK_LT(col, layout.cols);
   int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride;
   int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride;
   return row * row_stride + col * col_stride;
 }

 // TODO(b/130417400) add a unit test
 inline int Offset(const PMatLayout& layout, int row, int col) {
   RUY_DCHECK(is_pot(layout.kernel.rows));
   RUY_DCHECK(is_pot(layout.kernel.cols));
   int row_outer = row & ~(layout.kernel.rows - 1);
   int col_outer = col & ~(layout.kernel.cols - 1);
   int row_stride_outer =
       layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
   int col_stride_outer =
       layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
   int offset_outer =
       row_outer * row_stride_outer + col_outer * col_stride_outer;
   int row_inner = row - row_outer;
   int col_inner = col - col_outer;
   int row_stride_inner =
       layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
   int col_stride_inner =
       layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
   int offset_inner =
       row_inner * row_stride_inner + col_inner * col_stride_inner;
   return offset_outer + offset_inner;
 }

 // Helpers for Mat<T>.

 template <typename Scalar>
 const Scalar* ElementPtr(const Mat<Scalar>& mat, int row, int col) {
   return mat.data.get() + Offset(mat.layout, row, col);
 }

 template <typename Scalar>
 Scalar* ElementPtr(Mat<Scalar>* mat, int row, int col) {
   return mat->data.get() + Offset(mat->layout, row, col);
 }

 template <typename Scalar>
 Scalar Element(const Mat<Scalar>& mat, int row, int col) {
   return *ElementPtr(mat, row, col);
 }

 // Helpers for PMat<T>.
 // Duplicated from Matrix<T>, but the duplication seems acceptable.

 template <typename Scalar>
 const Scalar* ElementPtr(const PMat<Scalar>& mat, int row, int col) {
   return mat.data + Offset(mat.layout, row, col);
 }

 template <typename Scalar>
 Scalar* ElementPtr(PMat<Scalar>* mat, int row, int col) {
   return mat->data + Offset(mat->layout, row, col);
 }

 template <typename Scalar>
 Scalar Element(const PMat<Scalar>& mat, int row, int col) {
   return *ElementPtr(mat, row, col);
 }

 // Helpers for PEMat.

 inline int DataBytes(const PEMat& packed) {
   return FlatSize(packed.layout) * packed.data_type.size;
 }

 inline int SumsBytes(const PEMat& packed) {
   // Packed matrices are only relevant for Ruy's TrMul implementations. For
   // TrMul, the number of sums is always equal to the number of columns.
   return packed.layout.cols * packed.sums_type.size;
 }

 // Transpose helpers.

 inline Order Transpose(Order order) {
   return order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
 }

 inline MatLayout Transpose(const MatLayout& layout) {
   MatLayout result(layout);
   result.order = Transpose(result.order);
   std::swap(result.rows, result.cols);
   return result;
 }

 template <typename Scalar>
 Mat<Scalar> Transpose(const Mat<Scalar>& matrix) {
   Mat<Scalar> result(matrix);
   result.layout = Transpose(result.layout);
   return result;
 }

 // Compile-time version of KernelLayout, used to declare kernel layouts in a
 // way that can be consumed by compile-time logic.
 template <Order tOrder, int tRows, int tCols>
 struct FixedKernelLayout {
   static constexpr Order kOrder = tOrder;
   static constexpr int kRows = tRows;
   static constexpr int kCols = tCols;
 };

 template <typename FixedKernelLayout>
 KernelLayout ToKernelLayout() {
   KernelLayout ret;
   ret.order = FixedKernelLayout::kOrder;
   ret.rows = FixedKernelLayout::kRows;
   ret.cols = FixedKernelLayout::kCols;
   return ret;
 }

 #if (__cplusplus < 201703L)
 // A static constexpr data member is automatically inline and should not require
 // redeclaration without an initializer. This is actually deprecated from C++17
 // onwards. Clang with -O0 without this can fail to link.
 template <Order tOrder, int tRows, int tCols>
 constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kCols;
 template <Order tOrder, int tRows, int tCols>
 constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kRows;
 #endif

 }  // namespace ruy

 #endif  // RUY_RUY_INTERNAL_MATRIX_H_
	/* Copyright 2019 Google LLC. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/

	// Internal types and helpers for matrices.
	// "Mat" is the name we use to refer to our internal matrix classes; it can be
	// thought of as a shorter version of "InternalMatrix"`
	//
	// Ruy has four internal matrix classes, besides the
	// Matrix<T> class that we expose to the user-facing API.
	//
	// TODO(silvasean): Put parts of this architecture description somewhere more
	// prominent.
	//
	// The 4 internal matrix classes are named Mat, EMat, PMat, PEMat, where:
	// - "E" indicates a type-erased class, storing a void* pointer and a runtime
	// enum value to track the scalar type, as opposed to being templatized
	// on a Scalar type and storing a Scalar* pointer.
	// - "P" indicates a packed matrix class, the output of the packing code and
	// input of the kernel code. See comments in pack.h regarding packing.
	//
	// In other words:
	//
	// Plain matrices Packed matrices
	// +----------------------------------
	// Templated \| Mat, Matrix PMat
	// Type-erased \| EMat PEMat
	//
	// Note that Matrix<T> is not implemented in terms of the internal types. It
	// is an independent, simple, and user-facing type. Matrix<T> is functionally
	// equivalent to Mat, but we keep it separate to insulate internals from
	// interface and to be able to make different compromises in internals
	// vs interface: in internals we prefer Mat to be a C-style struct with
	// raw data member access and to be similar to the other PMat/EMat/PEMat
	// classes for consistency.
	//
	// The use of type-erasure might seem surprising for a library like Ruy with a
	// heavily-templated entry point, but it is motivated by the desire for most of
	// Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3
	// main parts:
	// - "entry-point" (ruy.h) - this is the highly templated ruy::Mul entry
	// point.
	// - "front-end" (frontend., validate., create_trmul_params.*,
	// prepare_packed_matrices.*) - the work to handle the entry-point call down to
	// the point where it can be handed off to the middle/back ends below. That
	// includes routines that select RunKernel and RunPack
	// implementations statically based on those template parameters.
	// - "back-end" (kernel_., pack_.)- this consists of the implementations of
	// RunKernel and RunPack, often in assembly code, which are the building blocks
	// that Ruy calls to perform matrix multiplication. These are templated so that
	// only the requested types/Path's are actually emitted by the compiler.
	// - "middle-end" (trmul.*) - this is the part of Ruy that orchestrates the
	// calls to the "back-end" optimized building blocks. This layer has to deal
	// with issues like cache locality and low-overhead multi-threading.
	//
	// There is a desire for the "middle-end" to be non-templated in order to
	// simplify the implementation and reduce code-size. We type-erase when going
	// from the "front-end" to the "middle-end", and un-type-erase going from the
	// "middle-end" to the "back-end". The un-type-erasure is possible because the
	// "front-end" is responsible for instantiating the needed "back-end" templates,
	// and thus the static type information is still present.
	//
	// Each layer of Ruy uses matrix types:
	// - "entry-point": Matrix<T>
	// - "front-end": Mat
	// - "middle-end": EMat, PEMat
	// - "back-end": Mat, PMat
	//
	// The use of separate types for packed matrices is not essential, but makes it
	// obvious at a glance whether a matrix is a packed matrix or not. We would
	// reconsider this decision if there was significant duplication between packed
	// and unpacked matrices, but that doesn't seem to be the case at the moment.
	//
	// Another goal is to keep the user-facing Matrix<T> as simple and
	// understandable as possible. Ideally, a user should be able to read the struct
	// definition for Matrix<T> and see a very simple definition with no internal
	// details like sums and kernel block layout.

	#ifndef RUY_RUY_INTERNAL_MATRIX_H_
	#define RUY_RUY_INTERNAL_MATRIX_H_

	#include <cstddef>
	#include <cstdint>
	#include <type_traits>
	#include <utility>

	#include "ruy/check_macros.h"
	#include "ruy/matrix.h"
	#include "ruy/size_util.h"

	namespace ruy {

	// Internal counterpart of Layout, used by Mat.
	struct MatLayout final {
	std::int32_t rows = 0;
	std::int32_t cols = 0;
	// Stride is the offset between two adjacent matrix elements
	// in the non-contiguous direction.
	std::int32_t stride = 0;
	Order order = Order::kColMajor;
	};

	inline MatLayout ToInternal(const Layout& src) {
	MatLayout ret;
	ret.rows = src.rows();
	ret.cols = src.cols();
	ret.stride = src.stride();
	ret.order = src.order();
	return ret;
	}

	// Internal counterpart of Matrix
	template <typename Scalar>
	struct Mat final {
	detail::ConstCheckingPtr<Scalar> data;
	MatLayout layout;
	Scalar zero_point = 0;
	CachePolicy cache_policy = CachePolicy::kNeverCache;
	};

	template <typename Scalar>
	inline Mat<Scalar> ToInternal(const Matrix<Scalar>& src) {
	Mat<Scalar> ret;
	ret.data.set(src.data());
	ret.layout = ToInternal(src.layout());
	ret.zero_point = src.zero_point();
	ret.cache_policy = src.cache_policy();
	return ret;
	}

	template <typename Scalar>
	inline Mat<Scalar> ToInternal(Matrix<Scalar>& src) {
	Mat<Scalar> ret;
	ret.data.set(src.data());
	ret.layout = ToInternal(src.layout());
	ret.zero_point = src.zero_point();
	ret.cache_policy = src.cache_policy();
	return ret;
	}

	// KernelLayout describes small-scale block structure in a packed matrix layout.
	// It's a runtime (as opposed to compile-time-constant) version of the
	// FixedKernelLayout struct used to declare kernel layouts.
	//
	// This is is sometimes known as "tiling" in other contexts.
	//
	// For example, consider a packed matrix in column-major format with a
	// column-major KernelLayout. The matrix logically has a shape of
	// `[cols, rows]`. However, the matrix is laid out as though it were a 4D array
	// of shape `[cols / kcols, rows / krows, kcols, krows]`.
	//
	// Note that in the case of kcols=1, krows=1, this degenerates to
	// `[cols, rows, 1, 1]` which is equivalent to having no small-scale block
	// structure.
	struct KernelLayout final {
	Order order = Order::kColMajor;
	std::uint8_t rows = 1;
	std::uint8_t cols = 1;
	};

	// A packed matrix has a small-scale block structure that is not present in in
	// the input matrices. This block structure is necessary for the kernels to
	// process data efficiently.
	//
	// This struct is very similar to MatLayout, but has the extra KernelLayout
	// field.
	struct PMatLayout final {
	std::int32_t rows = 0;
	std::int32_t cols = 0;
	// Stride is the offset between two adjacent matrix elements
	// in the non-contiguous direction.
	std::int32_t stride = 0;
	Order order = Order::kColMajor;
	// Small scale layout shuffling, potentially departing from
	// linear row-major or column-major storage. See KernelLayout.
	KernelLayout kernel;
	};

	inline bool operator==(const PMatLayout& a, const PMatLayout& b) {
	return a.cols == b.cols && a.rows == b.rows && a.stride == b.stride &&
	a.order == b.order && a.kernel.rows == b.kernel.rows &&
	a.kernel.cols == b.kernel.cols && a.kernel.order == b.kernel.order;
	}

	// Dynamic representation for a type.
	//
	// The most important field in this struct is the size, which Ruy uses to know
	// how much memory to allocate without having to be templated on a type.
	// Signed-ness and floating-point-ness are mainly present as debugging checks.
	//
	// Note: Ruy does not use this struct to to dynamically dispatch between
	// different typed implementations. As described in the comment at the top of
	// this file, Ruy's "front-end", which is templated, instantiates all the
	// necessary "back-end" routines with complete static knowledge of all the
	// types.
	struct Type final {
	template <typename T>
	static Type Create() {
	Type ret;
	ret.is_signed = std::is_signed<T>::value;
	ret.is_floating_point = std::is_floating_point<T>::value;
	ret.size = sizeof(T);
	return ret;
	}

	template <typename T>
	void AssertIs() const {
	RUY_DCHECK_EQ(is_signed, Create<T>().is_signed);
	RUY_DCHECK_EQ(is_floating_point, Create<T>().is_floating_point);
	RUY_DCHECK_EQ(size, Create<T>().size);
	}

	bool is_signed = false;
	bool is_floating_point = false;
	std::uint8_t size = 0;
	};

	inline bool operator==(const Type& type1, const Type& type2) {
	return type1.is_signed == type2.is_signed &&
	type1.is_floating_point == type2.is_floating_point &&
	type1.size == type2.size;
	}

	// Type-erased matrix.
	struct EMat final {
	Type data_type;
	void* data = nullptr;
	MatLayout layout;
	std::int32_t zero_point = 0;
	CachePolicy cache_policy = CachePolicy::kNeverCache;
	};

	// Type-erased packed matrix.
	struct PEMat final {
	Type data_type;
	void* data = nullptr;
	Type sums_type;
	void* sums = nullptr;
	PMatLayout layout;
	std::int32_t zero_point = 0;
	};

	// Convenient typed helper for packed matrices.
	template <typename Scalar>
	struct PMat final {
	// The row/column sums needed for quantized matrix multiplication when
	// the opposite operand of the multiplication uses a non-symmetric zero
	// point.
	// This member is only relevant for packed matrices.
	// Additionally, Ruy always uses 32-bit signed accumulators for quantized
	// matrix multiplication.
	// For floating point types, there is no quantization, so this pointer
	// will always be null. We still need code referencing it to compile
	// though, even if it is always branched around. Hence we use Scalar*
	// itself as the type in that case.
	using SumsType =
	typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
	std::int32_t>::type;

	Scalar* data = nullptr;
	SumsType* sums = nullptr;
	PMatLayout layout;
	std::int32_t zero_point = 0;
	};

	template <typename T>
	EMat EraseType(const Mat<T>& matrix) {
	EMat ret;
	ret.data_type = Type::Create<T>();
	ret.data = const_cast<void>(static_cast<const void>(matrix.data.get()));
	ret.layout = matrix.layout;
	ret.zero_point = matrix.zero_point;
	ret.cache_policy = matrix.cache_policy;
	return ret;
	}

	template <typename T>
	Mat<T> UneraseType(const EMat& matrix) {
	matrix.data_type.AssertIs<T>();
	Mat<T> ret;
	ret.data.set(static_cast<T*>(matrix.data));
	ret.layout = matrix.layout;
	ret.zero_point = matrix.zero_point;
	ret.cache_policy = matrix.cache_policy;
	return ret;
	}

	template <typename T>
	PMat<T> UneraseType(const PEMat& matrix) {
	using SumsType = typename PMat<T>::SumsType;
	matrix.data_type.AssertIs<T>();
	matrix.sums_type.AssertIs<SumsType>();
	PMat<T> ret;
	ret.data = static_cast<T*>(matrix.data);
	ret.sums = static_cast<SumsType*>(matrix.sums);
	ret.layout = matrix.layout;
	ret.zero_point = matrix.zero_point;
	return ret;
	}

	// Helpers for MatLayout / PMatLayout.

	inline bool IsUnstrided(const MatLayout& layout) {
	if (layout.order == Order::kColMajor) {
	return layout.stride == layout.rows;
	} else {
	return layout.stride == layout.cols;
	}
	}

	inline bool IsRowMajor(const MatLayout& layout) {
	return layout.order == Order::kRowMajor;
	}

	inline bool IsColMajor(const MatLayout& layout) {
	return layout.order == Order::kColMajor;
	}

	inline int FlatSize(const MatLayout& layout) {
	const int outerdim =
	layout.order == Order::kColMajor ? layout.cols : layout.rows;
	return layout.stride * outerdim;
	}

	inline bool IsUnstrided(const PMatLayout& layout) {
	if (layout.order == Order::kColMajor) {
	return layout.stride == layout.rows;
	} else {
	return layout.stride == layout.cols;
	}
	}

	inline bool IsRowMajor(const PMatLayout& layout) {
	return layout.order == Order::kRowMajor;
	}

	inline bool IsColMajor(const PMatLayout& layout) {
	return layout.order == Order::kColMajor;
	}

	inline int FlatSize(const PMatLayout& layout) {
	const int outerdim =
	layout.order == Order::kColMajor ? layout.cols : layout.rows;
	return layout.stride * outerdim;
	}

	// TODO(b/130417400) add a unit test
	inline int Offset(const MatLayout& layout, int row, int col) {
	// TODO(benoitjacob) - should check this but this make the _slow tests take
	// 5x longer. Find a mitigation like in Eigen with an 'internal' variant
	// bypassing the check?
	// RUY_DCHECK_GE(row, 0);
	// RUY_DCHECK_GE(col, 0);
	// RUY_DCHECK_LT(row, layout.rows);
	// RUY_DCHECK_LT(col, layout.cols);
	int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride;
	int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride;
	return row * row_stride + col * col_stride;
	}

	// TODO(b/130417400) add a unit test
	inline int Offset(const PMatLayout& layout, int row, int col) {
	RUY_DCHECK(is_pot(layout.kernel.rows));
	RUY_DCHECK(is_pot(layout.kernel.cols));
	int row_outer = row & ~(layout.kernel.rows - 1);
	int col_outer = col & ~(layout.kernel.cols - 1);
	int row_stride_outer =
	layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
	int col_stride_outer =
	layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
	int offset_outer =
	row_outer * row_stride_outer + col_outer * col_stride_outer;
	int row_inner = row - row_outer;
	int col_inner = col - col_outer;
	int row_stride_inner =
	layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
	int col_stride_inner =
	layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
	int offset_inner =
	row_inner * row_stride_inner + col_inner * col_stride_inner;
	return offset_outer + offset_inner;
	}

	// Helpers for Mat<T>.

	template <typename Scalar>
	const Scalar* ElementPtr(const Mat<Scalar>& mat, int row, int col) {
	return mat.data.get() + Offset(mat.layout, row, col);
	}

	template <typename Scalar>
	Scalar* ElementPtr(Mat<Scalar>* mat, int row, int col) {
	return mat->data.get() + Offset(mat->layout, row, col);
	}

	template <typename Scalar>
	Scalar Element(const Mat<Scalar>& mat, int row, int col) {
	return *ElementPtr(mat, row, col);
	}

	// Helpers for PMat<T>.
	// Duplicated from Matrix<T>, but the duplication seems acceptable.

	template <typename Scalar>
	const Scalar* ElementPtr(const PMat<Scalar>& mat, int row, int col) {
	return mat.data + Offset(mat.layout, row, col);
	}

	template <typename Scalar>
	Scalar* ElementPtr(PMat<Scalar>* mat, int row, int col) {
	return mat->data + Offset(mat->layout, row, col);
	}

	template <typename Scalar>
	Scalar Element(const PMat<Scalar>& mat, int row, int col) {
	return *ElementPtr(mat, row, col);
	}

	// Helpers for PEMat.

	inline int DataBytes(const PEMat& packed) {
	return FlatSize(packed.layout) * packed.data_type.size;
	}

	inline int SumsBytes(const PEMat& packed) {
	// Packed matrices are only relevant for Ruy's TrMul implementations. For
	// TrMul, the number of sums is always equal to the number of columns.
	return packed.layout.cols * packed.sums_type.size;
	}

	// Transpose helpers.

	inline Order Transpose(Order order) {
	return order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
	}

	inline MatLayout Transpose(const MatLayout& layout) {
	MatLayout result(layout);
	result.order = Transpose(result.order);
	std::swap(result.rows, result.cols);
	return result;
	}

	template <typename Scalar>
	Mat<Scalar> Transpose(const Mat<Scalar>& matrix) {
	Mat<Scalar> result(matrix);
	result.layout = Transpose(result.layout);
	return result;
	}

	// Compile-time version of KernelLayout, used to declare kernel layouts in a
	// way that can be consumed by compile-time logic.
	template <Order tOrder, int tRows, int tCols>
	struct FixedKernelLayout {
	static constexpr Order kOrder = tOrder;
	static constexpr int kRows = tRows;
	static constexpr int kCols = tCols;
	};

	template <typename FixedKernelLayout>
	KernelLayout ToKernelLayout() {
	KernelLayout ret;
	ret.order = FixedKernelLayout::kOrder;
	ret.rows = FixedKernelLayout::kRows;
	ret.cols = FixedKernelLayout::kCols;
	return ret;
	}

	#if (__cplusplus < 201703L)
	// A static constexpr data member is automatically inline and should not require
	// redeclaration without an initializer. This is actually deprecated from C++17
	// onwards. Clang with -O0 without this can fail to link.
	template <Order tOrder, int tRows, int tCols>
	constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kCols;
	template <Order tOrder, int tRows, int tCols>
	constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kRows;
	#endif

	} // namespace ruy

	#endif // RUY_RUY_INTERNAL_MATRIX_H_