/*
* Copyright (c) 2017 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once

#include <stdio.h>
#include <stdint.h>

#include "gemm_common.hpp"
#include "profiler.hpp"
#include "transform.hpp"
#include "mergeresults.hpp"

// Some macros used to decide how much working space to allocate.
// Round allocations up to the next cache line.
#define ALLOC_ROUND 64
#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
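// e.g. with ALLOC_ROUND=64, ROUND_UP(100)=128 and ROUND_UP(128)=128, so
// buffers carved out of a single allocation never share a cache line.
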
// Implementation of the GemmCommon abstract class.
//
// This implementation interleaves the source matrices in blocks - good for
// larger matrices.
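//
// In outline, execute() below runs:
//
//   for each k_block slice of K:          (sized so the operands fit in L1)
//     interleave the A slice into a_panel
//     for each x_block slice of N:        (sized so the B panel fits in L2)
//       interleave the B slice into b_panel
//       for each group of out_height rows of M:
//         run the kernel into c_panel
//         merge c_panel into C, applying alpha and beta
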
template<typename strategy, typename To, typename Tr>
class GemmInterleaved : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    const unsigned int M;
    const unsigned int N;
    const unsigned int K;

    const bool trA;
    const bool trB;

    const strategy strat;

    // Blocking parameters, computed by the constructor.
    unsigned int k_block = 0;
    unsigned int x_block = 0;
    unsigned int Mround = 0;

    size_t get_a_working_size() const {
        return ROUND_UP(sizeof(Toi) * k_block * Mround);
    }

    size_t get_b_working_size() const {
        return ROUND_UP(sizeof(Toi) * x_block * k_block);
    }

    size_t get_c_working_size() const {
        return ROUND_UP(sizeof(Tri) * x_block * strat.out_height);
    }

public:
    size_t get_working_size() const override {
        // Allow an extra 16 bytes so that execute() can align the start of
        // the buffer without running off the end of the allocation.
        return get_a_working_size() + get_b_working_size() + get_c_working_size() + 16;
    }

    GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB) : M(M), N(N), K(K), trA(trA), trB(trB), strat(ci) {
        const unsigned int L1_size = ci->L1_size;
        const unsigned int L2_size = ci->L2_size;

        // Work out blocking parameters.

        // k_block: each iteration of the kernel consumes (out_width +
        // out_height) operands per K value - so how many K values will
        // fill the L1?
        k_block = L1_size / (sizeof(Toi) * (strat.out_width + strat.out_height));

        // Needs to be a multiple of the K unroll level.
        k_block /= strat.k_unroll;
        k_block *= strat.k_unroll;

        // Now tune to the presented problem size: this is how many blocks
        // of that size K needs...
        int num_k_blocks = (K + (k_block - 1)) / k_block;

        // ...so divide K equally across that many blocks...
        k_block = (K + num_k_blocks - 1) / num_k_blocks;

        // ...and round UP to the K unroll level required.
        k_block = (k_block + strat.k_unroll - 1) / strat.k_unroll;
        k_block *= strat.k_unroll;
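
        // Worked example (illustrative numbers only): with 4-byte operands,
        // out_width=12, out_height=8, k_unroll=1 and a 32KB L1, the first
        // step gives k_block = 32768 / (4 * 20) = 409; K=1000 then needs 3
        // blocks, so the even split is k_block = ceil(1000/3) = 334.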

        // x_block: work out how many rows (of length k_block) will fit in
        // the L2.
        x_block = L2_size / (sizeof(Toi) * k_block);

        // Needs to be a multiple of the kernel output width.
        x_block /= strat.out_width;
        x_block *= strat.out_width;

        // As with k_block, tune to the presented problem size: divide N
        // equally across the number of blocks needed, then round back up
        // to a multiple of out_width.
        int num_x_blocks = (N + (x_block - 1)) / x_block;
        x_block = (N + num_x_blocks - 1) / num_x_blocks;
        x_block = (x_block + strat.out_width - 1) / strat.out_width;
        x_block *= strat.out_width;
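
        // Continuing the example: with a 512KB L2, x_block = 524288 /
        // (4 * 334) = 392, rounded down to 384 (a multiple of out_width=12);
        // N=1000 then needs 3 blocks, so the even split is ceil(1000/3) =
        // 334, rounded back up to 336.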

        // Work out the rounded size of M - needed for some buffers.
        Mround = (M + (strat.out_height - 1)) / strat.out_height;
        Mround *= strat.out_height;
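
        // e.g. M=100 with out_height=8 gives Mround=104, so the A buffer is
        // always sized for whole groups of out_height rows.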
    }

    // Actually execute the GEMM.
    void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
        profiler prof;

        // Align the caller-supplied working space to a 16-byte boundary.
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
        intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes);
        size_t diff = 0;

        if (working_space_int & 0xF) {
            diff = 0x10 - (working_space_int & 0xF);
        }

        // Carve the working space into the three panels: interleaved A,
        // interleaved B, and the kernel's result staging area.
        Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + diff);
        Toi * const b_panel = reinterpret_cast<Toi *>(working_space_bytes + get_a_working_size() + diff);
        Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + get_a_working_size() + get_b_working_size() + diff);

        for (unsigned int k0=0; k0<K; k0 += k_block) {
            unsigned int kmax = k0 + k_block;
            if (kmax > K) kmax = K;

            // Figure out how many K values the kernel will actually process
            // - the requested range rounded up to the unroll level.
            int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll;
            kern_k *= strat.k_unroll;
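
            // e.g. with k_unroll=4, a tail block of 330 K values rounds up
            // to kern_k = 332 (illustrative numbers); the interleave
            // transforms pad the panels accordingly.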

            prof(PROFILE_PREPA, [&](void) {
                if (trA ^ strategy::A_transpose) {
                    Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax);
                } else {
                    Transform<strategy::A_interleave, strategy::A_block, false>(a_panel, A, lda, 0, M, k0, kmax);
                }
            });

            for (unsigned int x0=0; x0<N; x0 += x_block) {
                unsigned int xmax = x0 + x_block;
                if (xmax > N) xmax = N;

                // Number of out_width-wide column blocks for the kernel to
                // process in this pass.
                int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width;

                prof(PROFILE_PREPB, [&](void) {
                    if (trB ^ strategy::B_transpose) {
                        Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax);
                    } else {
                        Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, B, ldb, x0, xmax, k0, kmax);
                    }
                });

                for (unsigned int y=0; y<M; y+=strat.out_height) {
                    unsigned int ymax = y + strat.out_height;
                    if (ymax > M) ymax = M;

                    prof(PROFILE_KERNEL, [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });

                    // Merge staged results into C; beta is applied only on
                    // the first pass over K, later passes accumulate.
                    prof(PROFILE_MERGE, [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
                }
            }
        }
    }
};
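
// A minimal usage sketch (illustrative only: "sgemm_8x12" stands in for a
// concrete strategy class, and obtaining the CPUInfo is elided):
//
//   GemmInterleaved<sgemm_8x12, float, float> gemm(ci, M, N, K, false, false);
//   std::vector<int8_t> working_space(gemm.get_working_size());
//   gemm.execute(A, lda, B, ldb, C, ldc, alpha, beta, working_space.data());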