// Implements the math functions for CPU.
// The implementation in this file lets the underlying numerical computation
// routines be built with different compiler options (-mno-avx2 or -mavx2).
#include <algorithm>
#include <array>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstring>
#include <functional>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <tuple>
#include <unordered_set>
#include <vector>
#include "caffe2/core/context.h"
#include "caffe2/perfkernels/math.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/cpuid.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "Eigen/Core"
#include "Eigen/Dense"
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
#ifdef CAFFE2_USE_HPTT
#include <hptt.h>
#endif // CAFFE2_USE_HPTT
#if defined(_MSC_VER)
#include <process.h>
#endif
namespace caffe2 {
namespace math {
#define QEPSILON 1e-8
void quantize_and_compress__avx2(
const float* input_data,
uint8_t* output_data,
size_t input_size,
size_t bitwidth,
bool random,
#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
VSLStreamStatePtr& vslStream,
std::vector<float>& random_buffer
#else
std::unique_ptr<std::uniform_real_distribution<float>>& dis,
std::minstd_rand& gen
#endif
) {
CAFFE_ENFORCE(
bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8,
"Unsupported bitwidth");
#ifdef __AVX2__
// Within each 128-bit lane, gathers the low byte of every 32-bit element
// into the lane's low four bytes; all other bytes are zeroed (0xff selector).
__m256i shuffle_mask_v = _mm256_set_epi8(
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0x0c, 0x08, 0x04, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0x0c, 0x08, 0x04, 0x00);
// Moves the packed 4-byte group of the high lane next to that of the low
// lane so all eight packed bytes land in the low 64 bits of the register.
__m256i permute_mask_v =
_mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
#endif // __AVX2__
// memory pointers
ConstEigenVectorArrayMap<float> input_row(input_data, input_size);
uint8_t* output_row = output_data;
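// Layout of each compressed row: a 10-byte header followed by packed data.
//   byte 0     : bitwidth (1, 2, 4, or 8)
//   byte 1     : tail (unused value slots in the last data byte)
//   bytes 2-5  : row minimum (float)
//   bytes 6-9  : row maximum (float)
//   bytes 10.. : packed quantized values, 8 / bitwidth values per byte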
EigenVectorArrayMap<uint8_t> output_bitwidth_tail(output_row, 2);
EigenVectorArrayMap<float> output_row_min_max(
reinterpret_cast<float*>(output_row + 2), 2);
size_t data_per_byte = 8 / bitwidth;
size_t tail = input_size % data_per_byte;
tail = tail ? data_per_byte - tail : 0;
size_t segment_size = (input_size + data_per_byte - 1) / data_per_byte;
// basic info
const float minimum_element = input_row.minCoeff();
const float maximum_element = input_row.maxCoeff();
output_bitwidth_tail(0) = bitwidth;
output_bitwidth_tail(1) = tail;
output_row_min_max(0) = minimum_element;
output_row_min_max(1) = maximum_element;
float gap = (maximum_element - minimum_element) / ((1 << bitwidth) - 1.0f);
float gap_inverse = 1. / (gap + QEPSILON);
uint8_t max_q = (1 << bitwidth) - 1;
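// Example: with bitwidth == 2, max_q == 3 and the quantization levels are
// minimum_element + {0, 1, 2, 3} * gap; a row with min 0 and max 3 has
// gap == 1, so each value maps to an integer level in [0, 3].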
size_t bit_start = 0;
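// The row is split into segments of `segment_size` values; segment s
// contributes bits [s * bitwidth, (s + 1) * bitwidth) of every data byte,
// which is why `bit_start` advances by `bitwidth` after each segment and the
// data bytes are assumed to be zero-initialized by the caller. With random ==
// true, values are stochastically rounded via floor(x + u), with u drawn from
// the supplied uniform distribution (nominally [0, 1)); otherwise they are
// rounded to the nearest level.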
if (random) {
#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
int status = vsRngUniform(
VSL_RNG_METHOD_UNIFORM_STD,
vslStream,
input_size,
random_buffer.data(),
0.0f,
1.0f);
if (status != VSL_ERROR_OK) {
LOG(WARNING) << "vsRngUniform returns " << status;
}
#endif
for (int start = 0; start < input_size; start += segment_size) {
size_t stride = start + segment_size <= input_size ? segment_size
: input_size - start;
int i = 0;
#ifdef __AVX2__
constexpr int VLEN = 8;
for (; i < stride / VLEN * VLEN; i += VLEN) {
#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
__m256 r_v = _mm256_loadu_ps(&random_buffer[start + i]);
#else
float random_buffer[VLEN];
for (int j = 0; j < VLEN; ++j) {
random_buffer[j] = (*dis)(gen);
}
__m256 r_v = _mm256_loadu_ps(random_buffer);
#endif
__m256 fval_v = _mm256_loadu_ps(input_data + start + i);
__m256 thetimes_v = _mm256_mul_ps(
_mm256_sub_ps(fval_v, _mm256_set1_ps(minimum_element)),
_mm256_set1_ps(gap_inverse));
__m256 rounded_v = _mm256_floor_ps(_mm256_add_ps(thetimes_v, r_v));
rounded_v = _mm256_max_ps(
_mm256_setzero_ps(),
_mm256_min_ps(_mm256_set1_ps(max_q), rounded_v));
__m256i qval_v = _mm256_cvtps_epi32(rounded_v);
__m256i orval_v = _mm256_cvtepu8_epi32(_mm_lddqu_si128(
reinterpret_cast<const __m128i*>(output_row + 10 + i)));
orval_v =
_mm256_or_si256(orval_v, _mm256_slli_epi32(qval_v, bit_start));
orval_v = _mm256_shuffle_epi8(orval_v, shuffle_mask_v);
orval_v = _mm256_permutevar8x32_epi32(orval_v, permute_mask_v);
*reinterpret_cast<int64_t*>(output_row + 10 + i) =
_mm256_extract_epi64(orval_v, 0);
}
#endif // __AVX2__
for (; i < stride; ++i) {
float fval = input_data[start + i];
float thetimes = (fval - minimum_element) * gap_inverse;
#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
float rounded = floor(thetimes + random_buffer[start + i]);
#else
float rounded = floor(thetimes + (*dis)(gen));
#endif
rounded = std::max(0.0f, std::min(static_cast<float>(max_q), rounded));
uint8_t qval = rounded;
uint8_t orval = output_row[10 + i];
output_row[10 + i] = orval | static_cast<uint8_t>(qval << bit_start);
}
bit_start += bitwidth;
}
} else {
for (int start = 0; start < input_size; start += segment_size) {
size_t stride = start + segment_size <= input_size ? segment_size
: input_size - start;
int i = 0;
#ifdef __AVX2__
constexpr int VLEN = 8;
for (; i < stride / VLEN * VLEN; i += VLEN) {
__m256 fval_v = _mm256_loadu_ps(input_data + start + i);
__m256 thetimes_v = _mm256_mul_ps(
_mm256_sub_ps(fval_v, _mm256_set1_ps(minimum_element)),
_mm256_set1_ps(gap_inverse));
thetimes_v = _mm256_max_ps(
_mm256_setzero_ps(),
_mm256_min_ps(_mm256_set1_ps(max_q), thetimes_v));
__m256i qval_v = _mm256_cvtps_epi32(_mm256_round_ps(
thetimes_v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
__m256i orval_v = _mm256_cvtepu8_epi32(_mm_lddqu_si128(
reinterpret_cast<const __m128i*>(output_row + 10 + i)));
orval_v =
_mm256_or_si256(orval_v, _mm256_slli_epi32(qval_v, bit_start));
orval_v = _mm256_shuffle_epi8(orval_v, shuffle_mask_v);
orval_v = _mm256_permutevar8x32_epi32(orval_v, permute_mask_v);
*reinterpret_cast<int64_t*>(output_row + 10 + i) =
_mm256_extract_epi64(orval_v, 0);
}
#endif // __AVX2__
for (; i < stride; ++i) {
float fval = input_data[start + i];
float thetimes = (fval - minimum_element) * gap_inverse;
thetimes =
std::max(0.0f, std::min(static_cast<float>(max_q), thetimes));
uint8_t qval = nearbyint(thetimes);
uint8_t orval = output_row[10 + i];
output_row[10 + i] = orval | static_cast<uint8_t>(qval << bit_start);
}
bit_start += bitwidth;
}
}
}
void decompress_and_dequantize__avx2(
const uint8_t* input_data,
float* output_data,
size_t input_size) {
ConstEigenVectorArrayMap<float> input_row_min_max(
reinterpret_cast<const float*>(input_data + 2), 2);
// basic info
const float minimum_element = input_row_min_max(0);
const float maximum_element = input_row_min_max(1);
const size_t bitwidth = input_data[0];
const float gap =
(maximum_element - minimum_element) / ((1 << bitwidth) - 1.f) +
QEPSILON; // for exact recovering
CAFFE_ENFORCE(
bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8,
"Unsupported bitwidth");
const size_t tail = input_data[1];
const size_t output_size = (input_size - 10) * (8 / bitwidth) - tail;
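// Each data byte past the 10-byte header holds 8 / bitwidth values; the
// last data byte carries `tail` unused slots, hence the subtraction.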
EigenVectorArrayMap<float> output_row(output_data, output_size);
// decoding
size_t bit_start = 0;
const size_t segment_size = input_size - 10;
for (int start = 0; start < output_size; start += segment_size) {
size_t stride = start + segment_size <= output_size ? segment_size
: output_size - start;
uint8_t mask = (1 << bitwidth) - 1;
int i = 0;
#ifdef __AVX2__
// Can process 8 elements at a time because we need to expand uint8_t
// to int32_t to use epi32 vector instructions.
constexpr int VLEN = 8;
for (; i < stride / VLEN * VLEN; i += VLEN) {
__m128i in_v = _mm_lddqu_si128(
reinterpret_cast<const __m128i*>(input_data + 10 + i));
__m256i out_epi32_v = _mm256_and_si256(
_mm256_srli_epi32(_mm256_cvtepu8_epi32(in_v), bit_start),
_mm256_set1_epi32(mask));
_mm256_storeu_ps(
output_data + start + i, _mm256_cvtepi32_ps(out_epi32_v));
}
#endif
for (; i < stride; ++i) {
output_data[start + i] = ((input_data[10 + i] >> bit_start) & mask);
}
bit_start += bitwidth;
}
// scaling and biasing
output_row = output_row * gap + minimum_element;
}
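#if 0
// Illustrative round-trip sketch (hypothetical helper, not compiled into the
// library): quantize a small row with bitwidth 2 and nearest rounding, then
// decompress it again. It assumes the non-MKL signature above (with
// FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL the RNG arguments differ) and a
// zero-initialized compressed buffer, since the kernel ORs the quantized bits
// into the existing data bytes.
void quantize_roundtrip_example() {
const std::vector<float> input = {0.f, 1.f, 2.f, 3.f, 3.f, 2.f, 1.f, 0.f};
const size_t bitwidth = 2;
const size_t values_per_byte = 8 / bitwidth;
const size_t data_bytes =
(input.size() + values_per_byte - 1) / values_per_byte;
// 10 header bytes (bitwidth, tail, min, max) plus the packed data.
std::vector<uint8_t> compressed(10 + data_bytes, 0);
std::unique_ptr<std::uniform_real_distribution<float>> dis(
new std::uniform_real_distribution<float>(0.0f, 1.0f));
std::minstd_rand gen(0);
quantize_and_compress__avx2(
input.data(),
compressed.data(),
input.size(),
bitwidth,
/*random=*/false,
dis,
gen);
std::vector<float> recovered(input.size());
decompress_and_dequantize__avx2(
compressed.data(), recovered.data(), compressed.size());
// recovered now holds the dequantized approximation of input.
}
#endif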
#undef QEPSILON
} // namespace math
} // namespace caffe2