blob: 52b6d0ffb5501785c605ed802a7f2811fa971fc1 [file] [log] [blame]
#include <immintrin.h>
#include "common_simd.h"
/* Issues _mm256_zeroupper() (clears the upper halves of the YMM registers).
 * In this file it is invoked right before the leftover columns are handed to
 * the SSE routine. */
#define CLEAR_AVX() _mm256_zeroupper()
/*
 * AVX 5x5 convolution kernel, 1-row variant.
 *
 * Steps across the row 8 columns at a time and delegates the arithmetic to the
 * DECLARE_OUTPUT_1 / CONVOLVE_8COLS_XROWS macros. Those macros are not defined
 * in this file (presumably common_simd.h); they appear to pick up output,
 * image, weight and the two strides by name -- confirm against the macros.
 *
 * output, image, weight : buffers consumed by the macros.
 * count                 : number of columns requested; only the largest
 *                         multiple of 8 not exceeding it is processed, the
 *                         remaining 0..7 columns are left to the caller.
 * outputStride,
 * inputStride           : not referenced directly here; presumably the row
 *                         pitch (in floats) of output and image.
 *
 * A count that is zero or negative results in no work.
 */
void convolve_5x5_1_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  long i = 0;
  /* Round down to a multiple of 8 in long arithmetic. A 32-bit hex mask such
   * as 0xFFFFFFF8 would clear the sign/upper bits of a 64-bit long and turn a
   * negative count into a huge positive bound. */
  long alignedCount = (count / 8) * 8;
  DECLARE_OUTPUT_1()
  for (; i < alignedCount; i+=8) {
    CONVOLVE_8COLS_XROWS(1, i)
  }
}
/*
 * AVX 5x5 convolution kernel, 2-row variant (the dispatcher in this file
 * advances its output pointer by 2 rows after each call).
 *
 * Steps across the rows 8 columns at a time and delegates the arithmetic to
 * the DECLARE_OUTPUT_2 / CONVOLVE_8COLS_XROWS macros. Those macros are not
 * defined in this file (presumably common_simd.h); they appear to pick up
 * output, image, weight and the two strides by name -- confirm against the
 * macros.
 *
 * count : number of columns requested; only the largest multiple of 8 not
 *         exceeding it is processed, the remaining 0..7 columns are left to
 *         the caller. Zero or negative means no work.
 * outputStride, inputStride : not referenced directly here; presumably the
 *         row pitch (in floats) of output and image.
 */
void convolve_5x5_2_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  long i = 0;
  /* Round down to a multiple of 8 in long arithmetic. A 32-bit hex mask such
   * as 0xFFFFFFF8 would clear the sign/upper bits of a 64-bit long and turn a
   * negative count into a huge positive bound. */
  long alignedCount = (count / 8) * 8;
  DECLARE_OUTPUT_2()
  for (; i < alignedCount; i+=8) {
    CONVOLVE_8COLS_XROWS(2, i)
  }
}
/*
 * AVX 5x5 convolution kernel, 4-row variant (the dispatcher in this file
 * advances its output pointer by 4 rows after each call).
 *
 * Steps across the rows 8 columns at a time and delegates the arithmetic to
 * the DECLARE_OUTPUT_4 / CONVOLVE_8COLS_XROWS macros. Those macros are not
 * defined in this file (presumably common_simd.h); they appear to pick up
 * output, image, weight and the two strides by name -- confirm against the
 * macros.
 *
 * count : number of columns requested; only the largest multiple of 8 not
 *         exceeding it is processed, the remaining 0..7 columns are left to
 *         the caller. Zero or negative means no work.
 * outputStride, inputStride : not referenced directly here; presumably the
 *         row pitch (in floats) of output and image.
 */
void convolve_5x5_4_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  long i = 0;
  /* Round down to a multiple of 8 in long arithmetic. A 32-bit hex mask such
   * as 0xFFFFFFF8 would clear the sign/upper bits of a 64-bit long and turn a
   * negative count into a huge positive bound. */
  long alignedCount = (count / 8) * 8;
  DECLARE_OUTPUT_4()
  for (; i < alignedCount; i+=8) {
    CONVOLVE_8COLS_XROWS(4, i)
  }
}
/*
 * AVX 5x5 convolution kernel instantiated with a row argument of 5
 * (presumably 5 output rows per call; no caller is visible in this file).
 *
 * Steps across the rows 8 columns at a time and delegates the arithmetic to
 * the DECLARE_OUTPUT_5 / CONVOLVE_8COLS_XROWS macros. Those macros are not
 * defined in this file (presumably common_simd.h); they appear to pick up
 * output, image, weight and the two strides by name -- confirm against the
 * macros.
 *
 * count : number of columns requested; only the largest multiple of 8 not
 *         exceeding it is processed, the remaining 0..7 columns are left to
 *         the caller. Zero or negative means no work.
 * outputStride, inputStride : not referenced directly here; presumably the
 *         row pitch (in floats) of output and image.
 */
void convolve_5x5_5_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  long i = 0;
  /* Round down to a multiple of 8 in long arithmetic. A 32-bit hex mask such
   * as 0xFFFFFFF8 would clear the sign/upper bits of a 64-bit long and turn a
   * negative count into a huge positive bound. */
  long alignedCount = (count / 8) * 8;
  DECLARE_OUTPUT_5()
  for (; i < alignedCount; i+=8) {
    CONVOLVE_8COLS_XROWS(5, i)
  }
}
/*
 * AVX 5x5 convolution kernel, 6-row variant (the dispatcher in this file
 * advances its output pointer by 6 rows after each call).
 *
 * Steps across the rows 8 columns at a time and delegates the arithmetic to
 * the DECLARE_OUTPUT_6 / CONVOLVE_8COLS_XROWS macros. Those macros are not
 * defined in this file (presumably common_simd.h); they appear to pick up
 * output, image, weight and the two strides by name -- confirm against the
 * macros.
 *
 * count : number of columns requested; only the largest multiple of 8 not
 *         exceeding it is processed, the remaining 0..7 columns are left to
 *         the caller. Zero or negative means no work.
 * outputStride, inputStride : not referenced directly here; presumably the
 *         row pitch (in floats) of output and image.
 */
void convolve_5x5_6_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  long i = 0;
  /* Round down to a multiple of 8 in long arithmetic. A 32-bit hex mask such
   * as 0xFFFFFFF8 would clear the sign/upper bits of a 64-bit long and turn a
   * negative count into a huge positive bound. */
  long alignedCount = (count / 8) * 8;
  DECLARE_OUTPUT_6()
  for (; i < alignedCount; i+=8) {
    CONVOLVE_8COLS_XROWS(6, i)
  }
}
/*
 * AVX 5x5 convolution kernel instantiated with a row argument of 7
 * (presumably 7 output rows per call; no caller is visible in this file).
 *
 * Steps across the rows 8 columns at a time and delegates the arithmetic to
 * the DECLARE_OUTPUT_7 / CONVOLVE_8COLS_XROWS macros. Those macros are not
 * defined in this file (presumably common_simd.h); they appear to pick up
 * output, image, weight and the two strides by name -- confirm against the
 * macros.
 *
 * count : number of columns requested; only the largest multiple of 8 not
 *         exceeding it is processed, the remaining 0..7 columns are left to
 *         the caller. Zero or negative means no work.
 * outputStride, inputStride : not referenced directly here; presumably the
 *         row pitch (in floats) of output and image.
 */
void convolve_5x5_7_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  long i = 0;
  /* Round down to a multiple of 8 in long arithmetic. A 32-bit hex mask such
   * as 0xFFFFFFF8 would clear the sign/upper bits of a 64-bit long and turn a
   * negative count into a huge positive bound. */
  long alignedCount = (count / 8) * 8;
  DECLARE_OUTPUT_7()
  for (; i < alignedCount; i+=8) {
    CONVOLVE_8COLS_XROWS(7, i)
  }
}
/*
 * AVX 5x5 convolution kernel instantiated with a row argument of 8
 * (presumably 8 output rows per call; no caller is visible in this file).
 *
 * Steps across the rows 8 columns at a time and delegates the arithmetic to
 * the DECLARE_OUTPUT_8 / CONVOLVE_8COLS_XROWS macros. Those macros are not
 * defined in this file (presumably common_simd.h); they appear to pick up
 * output, image, weight and the two strides by name -- confirm against the
 * macros.
 *
 * count : number of columns requested; only the largest multiple of 8 not
 *         exceeding it is processed, the remaining 0..7 columns are left to
 *         the caller. Zero or negative means no work.
 * outputStride, inputStride : not referenced directly here; presumably the
 *         row pitch (in floats) of output and image.
 */
void convolve_5x5_8_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  long i = 0;
  /* Round down to a multiple of 8 in long arithmetic. A 32-bit hex mask such
   * as 0xFFFFFFF8 would clear the sign/upper bits of a 64-bit long and turn a
   * negative count into a huge positive bound. */
  long alignedCount = (count / 8) * 8;
  DECLARE_OUTPUT_8()
  for (; i < alignedCount; i+=8) {
    CONVOLVE_8COLS_XROWS(8, i)
  }
}
/*
 * Fully unrolled AVX 5x5 convolution for the case the dispatcher selects when
 * outRows == 64 and outCols == 64.
 *
 * Rows are covered as ten bands of 6 (rows 0..59) followed by one band of 4
 * (rows 60..63); within each band the eight column offsets 0, 8, ..., 56 are
 * handled by one CONVOLVE_8COLS_XROWS invocation each. The macros are not
 * defined in this file (presumably common_simd.h) and appear to read output,
 * image, weight and the strides by name -- confirm against the macros.
 *
 * count is not referenced by the visible body. output and image are advanced
 * locally by six rows per band; the caller's pointers are unaffected.
 */
void convolve_5x5_64x64_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  /* Ten 6-row bands: rows 0..59. */
  for (int band = 0; band < 10; ++band) {
    DECLARE_OUTPUT_6()
    CONVOLVE_8COLS_XROWS(6, 0)
    CONVOLVE_8COLS_XROWS(6, 8)
    CONVOLVE_8COLS_XROWS(6, 16)
    CONVOLVE_8COLS_XROWS(6, 24)
    CONVOLVE_8COLS_XROWS(6, 32)
    CONVOLVE_8COLS_XROWS(6, 40)
    CONVOLVE_8COLS_XROWS(6, 48)
    CONVOLVE_8COLS_XROWS(6, 56)
    /* Move both cursors down to the next band. */
    output = output + 6 * outputStride;
    image = image + 6 * inputStride;
  }

  /* Final 4-row band: rows 60..63. */
  DECLARE_OUTPUT_4()
  CONVOLVE_8COLS_XROWS(4, 0)
  CONVOLVE_8COLS_XROWS(4, 8)
  CONVOLVE_8COLS_XROWS(4, 16)
  CONVOLVE_8COLS_XROWS(4, 24)
  CONVOLVE_8COLS_XROWS(4, 32)
  CONVOLVE_8COLS_XROWS(4, 40)
  CONVOLVE_8COLS_XROWS(4, 48)
  CONVOLVE_8COLS_XROWS(4, 56)
}
/*
 * Fully unrolled AVX 5x5 convolution for the case the dispatcher selects when
 * outRows == 32 and outCols == 32.
 *
 * Rows are covered as five bands of 6 (rows 0..29) followed by one band of 2
 * (rows 30..31); within each band the four column offsets 0, 8, 16, 24 are
 * handled by one CONVOLVE_8COLS_XROWS invocation each. The macros are not
 * defined in this file (presumably common_simd.h) and appear to read output,
 * image, weight and the strides by name -- confirm against the macros.
 *
 * count is not referenced by the visible body. output and image are advanced
 * locally by six rows per band; the caller's pointers are unaffected.
 */
void convolve_5x5_32x32_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  /* Five 6-row bands: rows 0..29. */
  for (int band = 0; band < 5; ++band) {
    DECLARE_OUTPUT_6()
    CONVOLVE_8COLS_XROWS(6, 0)
    CONVOLVE_8COLS_XROWS(6, 8)
    CONVOLVE_8COLS_XROWS(6, 16)
    CONVOLVE_8COLS_XROWS(6, 24)
    /* Move both cursors down to the next band. */
    output = output + 6 * outputStride;
    image = image + 6 * inputStride;
  }

  /* Final 2-row band: rows 30..31. */
  DECLARE_OUTPUT_2()
  CONVOLVE_8COLS_XROWS(2, 0)
  CONVOLVE_8COLS_XROWS(2, 8)
  CONVOLVE_8COLS_XROWS(2, 16)
  CONVOLVE_8COLS_XROWS(2, 24)
}
/*
 * Fully unrolled AVX 5x5 convolution for the case the dispatcher selects when
 * outRows == 16 and outCols == 16.
 *
 * Rows are covered as two bands of 6 (rows 0..11) followed by one band of 4
 * (rows 12..15); within each band the two column offsets 0 and 8 are handled
 * by one CONVOLVE_8COLS_XROWS invocation each. The macros are not defined in
 * this file (presumably common_simd.h) and appear to read output, image,
 * weight and the strides by name -- confirm against the macros.
 *
 * count is not referenced by the visible body. output and image are advanced
 * locally by six rows per band; the caller's pointers are unaffected.
 */
void convolve_5x5_16x16_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  /* Two 6-row bands: rows 0..11. */
  for (int band = 0; band < 2; ++band) {
    DECLARE_OUTPUT_6()
    CONVOLVE_8COLS_XROWS(6, 0)
    CONVOLVE_8COLS_XROWS(6, 8)
    /* Move both cursors down to the next band. */
    output = output + 6 * outputStride;
    image = image + 6 * inputStride;
  }

  /* Final 4-row band: rows 12..15. */
  DECLARE_OUTPUT_4()
  CONVOLVE_8COLS_XROWS(4, 0)
  CONVOLVE_8COLS_XROWS(4, 8)
}
/*
 * AVX 5x5 convolution for the case the dispatcher selects when outRows == 8
 * and outCols == 8: a single CONVOLVE_8COLS_XROWS invocation with a row
 * argument of 8 at column offset 0.
 *
 * The macros are not defined in this file (presumably common_simd.h) and
 * appear to read output, image, weight and the strides by name -- confirm
 * against the macros. count is not referenced by the visible body.
 */
void convolve_5x5_8x8_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
DECLARE_OUTPUT_8()
CONVOLVE_8COLS_XROWS(8, 0)
}
/* SSE counterpart with the same parameter list as convolve_5x5_avx; only
 * declared here, its definition is not in the visible part of this file.
 * convolve_5x5_avx calls it for the columns left over after the 8-wide AVX
 * passes. */
void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols);
/*
 * 5x5 convolution entry point for the AVX path.
 *
 * output    : destination buffer; consecutive rows are outStride floats apart.
 * input     : source buffer; consecutive rows are inCols floats apart.
 * kernel    : filter weights, forwarded unchanged to the row kernels.
 * outRows   : number of output rows to produce.
 * outCols   : number of output columns to produce.
 * outStride : row pitch of output, in floats.
 * inCols    : row pitch of input, in floats.
 *
 * Strategy:
 *   1. Square outputs of 64, 32, 16 or 8 go to a dedicated unrolled kernel.
 *   2. Otherwise rows are consumed in blocks of 6, then an optional block of
 *      2 to land on a multiple of 4, then blocks of 4, 2 and 1. Each row
 *      kernel handles only the columns up to the largest multiple of 8.
 *   3. The 0..7 leftover columns are processed for all rows by
 *      convolve_5x5_sse.
 *
 * Non-positive outRows or outCols result in no work.
 */
void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols) {
  /* Fast paths. The fourth argument lands in the kernels' count parameter,
   * which their visible bodies do not reference. */
  if ((outRows == 64) && (outCols == 64)) {
    convolve_5x5_64x64_avx(output, input, kernel, outRows, outStride, inCols);
    return;
  }
  if ((outRows == 32) && (outCols == 32)) {
    convolve_5x5_32x32_avx(output, input, kernel, outRows, outStride, inCols);
    return;
  }
  if ((outRows == 16) && (outCols == 16)) {
    convolve_5x5_16x16_avx(output, input, kernel, outRows, outStride, inCols);
    return;
  }
  if ((outRows == 8) && (outCols == 8)) {
    convolve_5x5_8x8_avx(output, input, kernel, outRows, outStride, inCols);
    return;
  }

  /* Row limits rounded down in long arithmetic. 32-bit hex masks such as
   * 0xFFFFFFFC would clear the sign/upper bits of a 64-bit long, so a negative
   * outRows would become a huge positive loop bound. */
  const long rowsBy6 = (outRows / 6) * 6;
  const long rowsBy4 = (outRows / 4) * 4;
  const long rowsBy2 = (outRows / 2) * 2;

  long yy = 0;          /* next output row to produce */
  float* r_ = output;   /* output cursor, kept in step with yy */

  for (; yy < rowsBy6; yy += 6) {
    convolve_5x5_6_avx(r_, input + yy * inCols, kernel, outCols, outStride, inCols);
    r_ += (outStride * 6);
  }

  /* At least 2 rows remain and yy (a multiple of 6 at this point) is not a
   * multiple of 4: process 2 rows so the following loop starts on a multiple
   * of 4. */
  if ((yy < rowsBy2) && ((yy % 4) != 0)) {
    convolve_5x5_2_avx(r_, input + yy * inCols, kernel, outCols, outStride, inCols);
    r_ += (outStride * 2);
    yy += 2;
  }

  for (; yy < rowsBy4; yy += 4) {
    convolve_5x5_4_avx(r_, input + yy * inCols, kernel, outCols, outStride, inCols);
    r_ += (outStride * 4);
  }

  for (; yy < rowsBy2; yy += 2) {
    convolve_5x5_2_avx(r_, input + yy * inCols, kernel, outCols, outStride, inCols);
    r_ += (outStride * 2);
  }

  for (; yy < outRows; yy += 1) {
    convolve_5x5_1_avx(r_, input + yy * inCols, kernel, outCols, outStride, inCols);
    r_ += (outStride * 1);
  }

  /* The AVX kernels process 8 columns at a time; hand the rest to SSE. */
  const long procCols = (outCols / 8) * 8;
  const long remCols = outCols - procCols;
  if (remCols > 0) {
    CLEAR_AVX();
    convolve_5x5_sse(&output[procCols], &input[procCols], kernel, outRows, remCols, outStride, inCols);
  }
}