| #include "ulp.h" |
| |
| #include <cstring> |
| #include "caffe2/operators/conv_pool_op_base.h" |
| #include "caffe2/utils/eigen_utils.h" |
| #include "ulp_neon.h" |
| |
| namespace caffe2 { |
| |
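// Quantize X into k2b1bXBits (= 2) binary bitplanes, packing 8 consecutive
// channels into each byte of the innermost dimension. With thresholds at
// offset, offset + inter_center_distance, and offset + 2 * inter_center_distance,
// each value maps to a 2-bit code p[1]p[0], i.e. one of the levels {0, 1, 2, 3}.
// For example, with offset = 0.5 and inter_center_distance = 1.0 (the values
// used by run2b1bConvGeneric below), v = 1.7 falls in [1.5, 2.5) and is
// encoded as code 10: bit set in p[1], clear in p[0].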
| void uniformQuantize2b1b(const TensorCPU& X, |
| const std::vector<std::unique_ptr<TensorCPU>>& XQ, |
| float offset, |
| float inter_center_distance) { |
| CAFFE_ENFORCE_GT(X.ndim(), 1); |
| const auto N = X.size_to_dim(X.ndim() - 1); |
| auto C = X.size() / N; |
| const auto QC = divRoundUp(C, 8); |
| auto XQs = X.sizes().vec(); |
| XQs[X.ndim() - 1] = QC; |
| CAFFE_ENFORCE_EQ(XQ.size(), k2b1bXBits); |
| for (auto i = 0; i < k2b1bXBits; ++i) { |
| XQ[i]->Resize(XQs); |
| } |
| const float* Xdata = X.data<float>(); |
| std::array<uint8_t*, k2b1bXBits> XQdata; |
| for (auto i = 0; i < k2b1bXBits; ++i) { |
| XQdata[i] = XQ[i]->mutable_data<uint8_t>(); |
| } |
| for (auto n = 0; n < N; ++n) { |
| for (auto qc = 0; qc < QC; ++qc) { |
      // Pack the next 8 channels of X into one byte per bitplane.
| std::array<uint8_t, k2b1bXBits> p = {{0, 0}}; |
| for (auto b = 0; b < 8; ++b) { |
| const auto c = qc * 8 + b; |
| if (c < C) { |
          float v = Xdata[c + C * n];
          if (v < offset) {
            // Code 00: p was already zero-initialized.
| } else if (v < offset + inter_center_distance) { |
| p[0] |= 1 << b; |
| } else if (v < offset + 2 * inter_center_distance) { |
| p[1] |= 1 << b; |
| } else { |
| p[0] |= 1 << b; |
| p[1] |= 1 << b; |
| } |
| } |
| } |
| for (auto i = 0; i < k2b1bXBits; ++i) { |
| XQdata[i][qc + QC * n] = p[i]; |
| } |
| } |
| } |
| } |
| |
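// Reference (scalar) binary convolution over packed sign bits (bit 1 == +1,
// bit 0 == -1), 8 channels per byte. For two +/-1 vectors packed into `bits`
// bits, dot(x, w) = bits - 2 * popcount(x XOR w), which is the
// KW * KH * KC * 8 - 2 * acc expression below. Out-of-bounds input contributes
// popcount(0 ^ w), i.e. padding acts as -1 in every bitplane; those
// contributions cancel to zero after run2b1bUnification.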
| void qconv(const ConvArgs& args, |
| const TensorCPU& X, |
| const TensorCPU& W, |
| const TensorCPU* b, |
| TensorCPU* Y) { |
| const auto N = X.dim32(0); |
| const auto IH = X.dim32(1); |
| const auto IW = X.dim32(2); |
| const auto KH = W.dim32(1); |
| const auto KW = W.dim32(2); |
| const auto KC = W.dim32(3); |
  Y->Resize(N,
            (IH - KH + args.pad_t + args.pad_b) / args.stride_h + 1,
            (IW - KW + args.pad_l + args.pad_r) / args.stride_w + 1,
            W.dim32(0));
| const auto OH = Y->dim32(1); |
| const auto OW = Y->dim32(2); |
| const auto OC = Y->dim32(3); |
| |
| CAFFE_ENFORCE_EQ(W.dim32(3), X.dim32(3)); |
| |
| const auto* Xdata = X.data<uint8_t>(); |
| const auto* Wdata = W.data<uint8_t>(); |
| auto* Ydata = Y->mutable_data<float>(); |
| for (size_t n = 0; n < N; ++n) { |
| for (size_t oh = 0; oh < OH; ++oh) { |
| for (size_t ow = 0; ow < OW; ++ow) { |
| for (size_t oc = 0; oc < OC; ++oc) { |
| float acc = 0.0; |
| for (size_t kh = 0; kh < KH; ++kh) { |
| const int32_t ih = (int32_t)kh + (int32_t)args.stride_h * oh - (int32_t)args.pad_t; |
| for (size_t kw = 0; kw < KW; ++kw) { |
| const int32_t iw = (int32_t)kw + (int32_t)args.stride_w * ow - (int32_t)args.pad_l; |
          for (size_t kc = 0; kc < KC; ++kc) {
            const uint8_t w = Wdata[kc + KC * kw + KC * KW * kh + KC * KW * KH * oc];
            // Cast to unsigned so one comparison per axis also rejects negative indices.
            if ((size_t)ih >= (size_t)IH || (size_t)iw >= (size_t)IW) {
              // Padded region: the input byte is all zero bits.
              acc += __builtin_popcount(0 ^ w);
            } else {
              const uint8_t x =
                  Xdata[kc + KC * (size_t)iw + KC * IW * (size_t)ih + n * KC * IW * IH];
              acc += __builtin_popcount(x ^ w);
            }
| } |
| } |
| } |
        Ydata[oc + OC * ow + OC * OW * oh + n * OC * OW * OH] =
            KW * KH * KC * 8 - 2 * acc + (b ? b->data<float>()[oc] : 0.0);
| } |
| } |
| } |
| } |
| } |
| |
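// Zero-pad a packed NHWC tensor: rows of X are copied as contiguous blocks
// into a zero-initialized Y, offset by the left/top padding. Only unit
// strides are supported here.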
| void qpad_zero(const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) { |
| CAFFE_ENFORCE_EQ(args.stride_h, 1); |
| CAFFE_ENFORCE_EQ(args.stride_w, 1); |
| const auto* Xdata = X.data<uint8_t>(); |
| Y->Resize(X.dim32(0), |
| X.dim32(1) + args.pad_t + args.pad_b, |
| X.dim32(2) + args.pad_l + args.pad_r, |
| X.dim32(3)); |
| auto* Ydata = Y->mutable_data<uint8_t>(); |
| ::memset(Ydata, 0, Y->nbytes()); |
| const auto C = Y->dim32(3); |
| const auto XrowSize = X.dim32(3) * X.dim32(2); |
| const auto YrowSize = Y->dim32(3) * Y->dim32(2); |
| math::CopyMatrix<CPUContext>(1, |
| X.dim32(1), |
| XrowSize, |
| Xdata, |
| XrowSize, |
| Ydata + C * args.pad_l + YrowSize * args.pad_t, |
| YrowSize, |
| nullptr); |
| } |
| |
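// Binarize a float tensor by sign (v > 0 -> bit 1, else bit 0), packing 8
// consecutive channels into each byte and rounding the channel dimension up
// to whole bytes.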
| void signQuantize(const TensorCPU& X, TensorCPU* XQ) { |
| CAFFE_ENFORCE_GT(X.ndim(), 1); |
| const auto N = X.size_to_dim(X.ndim() - 1); |
| auto C = X.size() / N; |
| const auto QC = divRoundUp(C, 8); |
| auto XQs = X.sizes().vec(); |
| XQs[X.ndim() - 1] = QC; |
| XQ->Resize(XQs); |
| const float* Xdata = X.data<float>(); |
| uint8_t* XQdata = XQ->mutable_data<uint8_t>(); |
| for (auto n = 0; n < N; ++n) { |
| for (auto qc = 0; qc < QC; ++qc) { |
      // Pack the next 8 channels of X into one byte.
| uint8_t p = 0; |
| for (auto b = 0; b < 8; ++b) { |
| const auto c = qc * 8 + b; |
| if (c < C) { |
| p |= (Xdata[c + C * n] > 0) << b; |
| } |
| } |
| XQdata[qc + QC * n] = p; |
| } |
| } |
| } |
| |
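// Per-filter normalization term for +/-1 decoding: with bit b in {0, 1}
// standing for the weight s = 2 * b - 1 in {-1, +1}, the sum of a filter's
// weights is 2 * popcount(bits) - #bits. run2b1bUnification consumes this to
// turn XNOR-popcount outputs back into real-valued dot products.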
| void filterNormalization11(const TensorCPU& WQ, TensorCPU* WQN) { |
| const auto F = WQ.dim32(0); |
  // The NEON kernel reads up to kGEMMTileSize elements at a time, so round
  // the allocation up to a multiple of kGEMMTileSize.
| WQN->Resize(divRoundUp(F, kGEMMTileSize) * kGEMMTileSize); |
| const auto WQs = WQ.size() / F; |
| const auto WQbits = 8 * WQs; |
| const auto* WQdata = WQ.data<uint8_t>(); |
| auto* WQNdata = WQN->mutable_data<float>(); |
| for (auto f = 0; f < F; ++f) { |
| int32_t bitSum = 0; |
| for (auto j = 0; j < WQs; ++j) { |
| bitSum += __builtin_popcount(WQdata[f * WQs + j]); |
| } |
| DCHECK_LE(bitSum, WQbits); |
| WQNdata[f] = 2 * bitSum - WQbits; |
| } |
| } |
| |
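// Mean absolute weight per filter (L1 norm divided by filter size), stored in
// QConvState for output rescaling; see the center-distance TODO in
// create2b1bConvState.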
| void filterNormalizationL1(const TensorCPU& W, TensorCPU* WL1) { |
| const auto F = W.dim32(0); |
| WL1->Resize(F); |
| const auto Ws = W.size() / F; |
| const auto* Wdata = W.data<float>(); |
| auto* WL1data = WL1->mutable_data<float>(); |
| for (auto f = 0; f < F; ++f) { |
| double l1sum = 0.0; |
| for (auto j = 0; j < Ws; ++j) { |
| l1sum += std::abs(Wdata[f * Ws + j]); |
| } |
| WL1data[f] = l1sum / Ws; |
| } |
| } |
| |
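// im2col over packed bitplanes: gather each KH x KW x KC window of XQ into a
// contiguous row of XQcol so the convolution becomes a byte-wise GEMM. For a
// 1x1 kernel with no padding and unit stride the two layouts coincide, so the
// input buffer is shared instead of copied.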
| void qim2col(const ConvArgs& args, const TensorCPU& XQ, const TensorCPU& WQ, TensorCPU* XQcol) { |
| // TODO: pass pre-resized output? |
| // TODO: handle strides? |
| |
| CAFFE_ENFORCE_EQ(XQ.dim32(3), WQ.dim32(3)); |
| const size_t N = XQ.dim32(0); |
| const size_t IH = XQ.dim32(1); |
| const size_t IW = XQ.dim32(2); |
| const size_t KH = WQ.dim32(1); |
| const size_t KW = WQ.dim32(2); |
| const size_t KC = WQ.dim32(3); |
| |
| XQcol->Resize(XQ.dim32(0), |
| (XQ.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1, |
| (XQ.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1, |
| KH * KW * KC); |
| |
| if (args.pad_l == 0 && args.pad_r == 0 && args.pad_b == 0 && args.pad_t == 0 && |
| args.stride_h == 1 && args.stride_w == 1 && KH == 1 && KW == 1) { |
| CAFFE_ENFORCE_EQ(XQ.size(), XQcol->size()); |
| XQcol->ShareExternalPointer(const_cast<uint8_t*>(XQ.data<uint8_t>()), XQ.size()); |
| return; |
| } |
| const size_t OH = XQcol->dim32(1); |
| const size_t OW = XQcol->dim32(2); |
| |
| const uint8_t* XQdata = XQ.data<uint8_t>(); |
| uint8_t* XQcoldata = XQcol->mutable_data<uint8_t>(); |
| for (size_t n = 0; n < N; ++n) { |
| for (size_t oh = 0; oh < OH; ++oh) { |
| int32_t h_pad = (int32_t)(args.stride_h * oh) - (int32_t)args.pad_t; |
| for (size_t ow = 0; ow < OW; ++ow) { |
| int32_t w_pad = (int32_t)(args.stride_w * ow) - (int32_t)args.pad_l; |
| for (size_t kh = 0; kh < KH; ++kh) { |
| int32_t ih = (int32_t)kh + h_pad; |
| if ((size_t)ih < (size_t)IH && (size_t)w_pad < (size_t)IW && |
| (size_t)((int32_t)w_pad + (int32_t)KW) < (size_t)IW) { |
          // The whole KW window is in bounds: copy all KW * KC bytes at once.
| size_t off = kh * KW * KC + ow * KH * KW * KC + oh * KH * KW * KC * OW + |
| n * KH * KW * KC * OW * OH; |
| std::memcpy(&XQcoldata[off], |
| &XQdata[((int32_t)w_pad) * KC + ih * IW * KC + n * IW * KC * IH], |
| KW * KC); |
| } else { |
| for (size_t kw = 0; kw < KW; ++kw) { |
| int32_t iw = (int32_t)kw + w_pad; |
            // Cast to unsigned so one comparison per axis also rejects negative indices.
| size_t off = kw * KC + kh * KW * KC + ow * KH * KW * KC + oh * KH * KW * KC * OW + |
| n * KH * KW * KC * OW * OH; |
| if ((size_t)ih < (size_t)IH && (size_t)iw < (size_t)IW) { |
| std::memcpy( |
| &XQcoldata[off], &XQdata[iw * KC + ih * IW * KC + n * KC * IW * IH], KC); |
| } else { |
            // Out-of-bounds input: pad with zero bytes.
| std::memset(&XQcoldata[off], 0, KC); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
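// Precompute everything that depends only on the weights: sign-quantized
// filters, their +/-1 normalization and L1 norms, scratch tensors, and a
// parallelFor helper (thread pool on mobile, a serial loop elsewhere).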
| std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws, |
| const TensorCPU& W, |
| const TensorCPU* b) { |
| auto state = std::make_unique<QConvState>(); |
| state->XQs.resize(k2b1bXBits); |
| state->YQs.resize(k2b1bXBits); |
| for (auto i = 0; i < k2b1bXBits; ++i) { |
| state->XQs[i] = std::make_unique<Tensor>(CPU); |
| state->YQs[i] = std::make_unique<Tensor>(CPU); |
| } |
| state->WQ = std::make_unique<Tensor>(CPU); |
| state->WQN = std::make_unique<Tensor>(CPU); |
| state->WQL1Norm = std::make_unique<Tensor>(CPU); |
| state->scratch = std::make_unique<Tensor>(CPU); |
| state->scratchColBuffer = std::make_unique<Tensor>(CPU); |
| |
| signQuantize(W, state->WQ.get()); |
| filterNormalization11(*(state->WQ), state->WQN.get()); |
| filterNormalizationL1(W, state->WQL1Norm.get()); |
| // TODO: incorporate center distance normalization. |
| // Since inputs to convs are [0, 1, 2, 3], instead of [0, x, 2 * x, ...], |
| // we can just uniformly rescale the outputs by x, i.e., |
  // for (auto i = 0; i < state->WQL1Norm->size(); ++i) {
  //   state->WQL1Norm->mutable_data<float>()[i] *= center_distance;
  // }
| state->parallelFor = [ws](size_t range, std::function<void(size_t)> f) { |
| #ifdef C10_MOBILE |
| ws->GetThreadPool()->run([&](int, size_t v) { f(v); }, range); |
| #else |
| for (size_t v = 0; v < range; ++v) { |
| f(v); |
| } |
| #endif |
| }; |
| if (b) { |
| state->bias = std::make_unique<Tensor>(*b, CPU); |
| } |
| return state; |
| } |
| |
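// Generic 2b1b convolution: take the NEON fast path when available, otherwise
// quantize X into two bitplanes, run one binary convolution per bitplane, and
// combine the per-bitplane outputs (plus bias) in run2b1bUnification.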
| void run2b1bConvGeneric(QConvState* state, const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) { |
| #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| if (run2b1bConvNeon(state, args, X, Y)) { |
| return; |
| } |
| #endif |
| uniformQuantize2b1b(X, state->XQs, 0.5, 1.0); |
| for (auto i = 0; i < k2b1bXBits; ++i) { |
| qconv(args, *(state->XQs[i]), *(state->WQ), nullptr, state->YQs[i].get()); |
| } |
| Y->ResizeLike(*(state->YQs[0])); |
| const auto F = state->WQ->dim(0); |
| const auto N = Y->size() / F; |
| run2b1bUnification(state, |
| N, |
| F, |
| state->WQN->data<float>(), |
| state->YQs[0]->data<float>(), |
| state->YQs[1]->data<float>(), |
| F, |
| Y->mutable_data<float>(), |
| F, |
| state->bias ? state->bias->data<float>() : nullptr); |
| } |
| |
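// Combine per-bitplane XNOR-popcount outputs into real-valued activations.
// Writing each activation as x = b0 + 2 * b1 with b_i in {0, 1}, and using
// b_i = (s_i + 1) / 2 for the +/-1 bits s_i, gives
//   dot(x, w) = (3 / 2) * sum(w) + (1 / 2) * Y0 + Y1,
// where Y_i is the +/-1 convolution of bitplane i and sum(w) is WQN. The
// (2^k2b1bXBits - 1) / 2 coefficient below is that 3 / 2 for two bitplanes.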
| void run2b1bUnification(QConvState* state, |
| size_t N, |
| size_t C, |
| const float* WQNVdata, |
| const float* YQs0Vdata, |
| const float* YQs1Vdata, |
| size_t YQstride, |
| float* Ydata, |
| size_t Ystride, |
| const float* bias) { |
| ConstEigenVectorArrayMap<float> WQNV(WQNVdata, C); |
| |
| for (size_t j = 0; j < N; ++j) { |
| ConstEigenVectorArrayMap<float> YQs0V(YQs0Vdata + YQstride * j, C); |
| ConstEigenVectorArrayMap<float> YQs1V(YQs1Vdata + YQstride * j, C); |
| EigenVectorArrayMap<float> YNV(Ydata + Ystride * j, C); |
| if (bias) { |
| ConstEigenVectorArrayMap<float> BV(bias, C); |
| YNV = (std::pow<float>(2, k2b1bXBits) - 1) / 2 * WQNV + std::pow<float>(2, -1) * YQs0V + |
| std::pow<float>(2, 0) * YQs1V + BV; |
| } else { |
| YNV = (std::pow<float>(2, k2b1bXBits) - 1) / 2 * WQNV + std::pow<float>(2, -1) * YQs0V + |
| std::pow<float>(2, 0) * YQs1V; |
| } |
| } |
| } |
| |
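// NHWC-only quantized convolution operator wrapping the 2b1b pipeline above.
// Inputs: X (float, NHWC), the filter, and an optional bias. A minimal usage
// sketch, assuming standard Caffe2 NetDef plumbing:
//
//   op {
//     type: "QConv"
//     input: "X"   # float, NHWC
//     input: "W"   # float filters
//     input: "b"   # optional float bias
//     output: "Y"
//     arg { name: "order" s: "NHWC" }
//   }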
| class QConvOp final : public ConvPoolOpBase<CPUContext> { |
| public: |
| QConvOp(const OperatorDef& operator_def, Workspace* ws) |
| : ConvPoolOpBase<CPUContext>(operator_def, ws), ws_(ws) { |
| OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NHWC, "QConvOp only supports NHWC order"); |
    OPERATOR_NEEDS_FEATURE(this->dilation_h() == 1, "QConvOp only supports dilation_h == 1");
    OPERATOR_NEEDS_FEATURE(this->dilation_w() == 1, "QConvOp only supports dilation_w == 1");
    OPERATOR_NEEDS_FEATURE(this->group_ == 1, "QConvOp only supports group == 1");
| } |
| |
| bool RunOnDeviceWithOrderNHWC() override { |
| auto& X = Input(0); |
| auto& filter = Input(1); |
| const auto* bias = InputSize() == 3 ? &Input(2) : nullptr; |
| auto* Y = Output(0); |
| |
| // TODO: Support multiple quantization methods instead of assuming 2b1b. |
| if (!state_) { |
| state_ = create2b1bConvState(ws_, filter, bias); |
| } |
| ConvArgs args; |
| args.pad_l = this->pad_l(); |
| args.pad_t = this->pad_t(); |
| args.pad_b = this->pad_b(); |
| args.pad_r = this->pad_r(); |
| args.stride_h = this->stride_h(); |
| args.stride_w = this->stride_w(); |
| run2b1bConvGeneric(state_.get(), args, X, Y); |
| return true; |
| } |
| |
| private: |
| std::unique_ptr<QConvState> state_; |
| Workspace* ws_; |
| }; |
| |
| REGISTER_CPU_OPERATOR(QConv, QConvOp); |
| |
| } // namespace caffe2 |