blob: 9bd308c5a5804ad25a777fab2ce0e1738675111e [file] [log] [blame]
#include "ulp.h"
#include "ulp_neon.h"
#include "gtest/gtest.h"
namespace caffe2 {
// Naive float reference convolution used as ground truth by the tests below.
//
// X is indexed as (n, h, w, c) and W as (oc, kh, kw, ic); Y is resized here to
// (n, oh, ow, oc). Kernel taps that fall outside the input (i.e. into the
// padded border) are skipped, which is equivalent to zero padding. `b` is an
// optional bias indexed by output channel and may be null.
void conv(const ConvArgs& args,
          const TensorCPU& X,
          const TensorCPU& W,
          const TensorCPU* b,
          TensorCPU* Y) {
  const auto batch = X.dim32(0);
  const auto inH = X.dim32(1);
  const auto inW = X.dim32(2);
  const auto kernelH = W.dim32(1);
  const auto kernelW = W.dim32(2);
  const auto inC = W.dim32(3);

  Y->Resize(X.dim32(0),
            (X.dim32(1) - kernelH + args.pad_t + args.pad_b) / args.stride_h + 1,
            (X.dim32(2) - kernelW + args.pad_l + args.pad_r) / args.stride_w + 1,
            W.dim32(0));
  TORCH_CHECK_EQ(W.dim32(3), X.dim32(3));

  const auto outH = Y->dim32(1);
  const auto outW = Y->dim32(2);
  const auto outC = Y->dim32(3);

  const auto* input = X.data<float>();
  const auto* filter = W.data<float>();
  auto* output = Y->mutable_data<float>();

  for (auto n = 0; n < batch; ++n) {
    for (auto oh = 0; oh < outH; ++oh) {
      for (auto ow = 0; ow < outW; ++ow) {
        for (auto oc = 0; oc < outC; ++oc) {
          float sum = b ? b->data<float>()[oc] : 0.0;
          for (int kh = 0; kh < kernelH; ++kh) {
            // Input row touched by this kernel row; out of range => padding.
            const auto ih = kh + args.stride_h * oh - args.pad_t;
            if (ih < 0 || ih >= inH) {
              continue;
            }
            for (int kw = 0; kw < kernelW; ++kw) {
              // Input column touched by this kernel column.
              const auto iw = kw + args.stride_w * ow - args.pad_l;
              if (iw < 0 || iw >= inW) {
                continue;
              }
              const auto xBase = inC * iw + inC * inW * ih + n * inC * inW * inH;
              const auto wBase = inC * kw + inC * kernelW * kh + inC * kernelW * kernelH * oc;
              for (int ic = 0; ic < inC; ++ic) {
                sum += input[ic + xBase] * filter[ic + wBase];
              }
            }
          }
          output[oc + outC * ow + outC * outW * oh + n * outC * outW * outH] = sum;
        }
      }
    }
  }
}
// Returns a uniformly distributed integer in the closed range [a, b].
//
// The engine is created and seeded from std::random_device once per thread and
// then reused, instead of constructing a random_device and re-seeding a fresh
// engine on every call.
int randInt(int a, int b) {
  thread_local std::default_random_engine gen{std::random_device{}()};
  return std::uniform_int_distribution<int>(a, b)(gen);
}
// Returns a float CPU tensor of the given shape in which every element is
// either -1 or +1, chosen by comparing a uniform [0, 1) draw against 0.5.
TensorCPU genTensor11(std::vector<int64_t> shape) {
  Tensor r(CPU);
  r.Resize(shape);
  std::random_device rd;
  std::default_random_engine gen(rd());
  std::uniform_real_distribution<float> dis(0, 1);
  // Fetch the buffer once rather than per element, and use a 64-bit counter so
  // the index type is not narrower than the element count it is compared with.
  float* data = r.mutable_data<float>();
  const int64_t count = r.size();
  for (int64_t i = 0; i < count; ++i) {
    data[i] = dis(gen) > 0.5f ? -1.0f : 1.0f;
  }
  return r;
}
// Returns a float CPU tensor of the given shape filled with independent draws
// from a uniform real distribution over [-5, 5).
TensorCPU genTensorUniform11(std::vector<int64_t> shape) {
  Tensor r(CPU);
  r.Resize(shape);
  std::random_device rd;
  std::default_random_engine gen(rd());
  std::uniform_real_distribution<float> dis(-5.0, 5.0);
  // Fetch the buffer once rather than per element, and use a 64-bit counter so
  // the index type is not narrower than the element count it is compared with.
  float* data = r.mutable_data<float>();
  const int64_t count = r.size();
  for (int64_t i = 0; i < count; ++i) {
    data[i] = dis(gen);
  }
  return r;
}
// Returns a float CPU tensor of the given shape whose elements are the whole
// numbers 0, 1, 2 or 3: each is the floor of a uniform draw from [0.1, 3.9).
TensorCPU genTensor0123(std::vector<int64_t> shape) {
  Tensor r(CPU);
  r.Resize(shape);
  std::random_device rd;
  std::default_random_engine gen(rd());
  std::uniform_real_distribution<float> dis(0.1, 3.9);
  // Fetch the buffer once rather than per element, and use a 64-bit counter so
  // the index type is not narrower than the element count it is compared with.
  float* data = r.mutable_data<float>();
  const int64_t count = r.size();
  for (int64_t i = 0; i < count; ++i) {
    data[i] = std::floor(dis(gen));
  }
  return r;
}
// Checks qpad_zero() on a sign-quantized 1x10x10 input with one element of
// padding on every side: the padded tensor must grow by the padding along both
// spatial dims, keep batch and channel dims, hold zero bytes in the border and
// reproduce the source bytes in the interior.
TEST(ULP, QPadZero) {
  ConvArgs args;
  args.pad_l = 1;
  args.pad_r = 1;
  args.pad_t = 1;
  args.pad_b = 1;
  const auto ICQ = 1;
  auto X = genTensor11({1, 10, 10, ICQ * 8});
  Tensor XQ(CPU), XQPad(CPU);
  signQuantize(X, &XQ);
  qpad_zero(args, XQ, &XQPad);
  EXPECT_EQ(XQ.dim32(0), XQPad.dim32(0));
  // Dim 1 is the row axis (top/bottom padding) and dim 2 the column axis
  // (left/right padding), matching how conv() in this file pairs them.
  EXPECT_EQ(XQ.dim32(1), XQPad.dim32(1) - args.pad_t - args.pad_b);
  EXPECT_EQ(XQ.dim32(2), XQPad.dim32(2) - args.pad_l - args.pad_r);
  EXPECT_EQ(XQ.dim32(3), XQPad.dim32(3));
  EXPECT_EQ(XQ.dim32(3), ICQ);
  EXPECT_EQ(XQPad.dim32(3), ICQ);
  const auto* XQdata = XQ.data<uint8_t>();
  const auto* XQPaddata = XQPad.data<uint8_t>();
  // Batch size is 1, so the flat offsets below carry no batch term.
  for (auto oh = 0; oh < XQPad.dim32(1); ++oh) {
    for (auto ow = 0; ow < XQPad.dim32(2); ++ow) {
      for (auto icq = 0; icq < ICQ; ++icq) {
        // Map the padded coordinate back to the source coordinate.
        auto ih = oh - args.pad_t;
        auto iw = ow - args.pad_l;
        if (ih < 0 || ih >= XQ.dim32(1) || iw < 0 || iw >= XQ.dim32(2)) {
          // Border element: must be zero.
          EXPECT_EQ(XQPaddata[icq + ICQ * ow + ICQ * XQPad.dim32(2) * oh], 0);
        } else {
          // Interior element: must equal the corresponding source byte.
          EXPECT_EQ(XQPaddata[icq + ICQ * ow + ICQ * XQPad.dim32(2) * oh],
                    XQdata[icq + ICQ * iw + ICQ * XQ.dim32(2) * ih]);
        }
      }
    }
  }
}
// Reference float GEMM with a transposed right-hand side:
// C[m][n] = sum_k A[m][k] * B[n][k], where A is M x K, B is N x K and C is
// M x N, all dense row-major.
inline void gemmNT(int M, int N, int K, const float* A, const float* B, float* C) {
  for (int row = 0; row < M; ++row) {
    const float* lhs = A + row * K;
    float* out = C + row * N;
    for (int col = 0; col < N; ++col) {
      const float* rhs = B + col * K;
      float dot = 0.0f;
      for (int k = 0; k < K; ++k) {
        dot += lhs[k] * rhs[k];
      }
      out[col] = dot;
    }
  }
}
// Bit-packed counterpart of gemmNT. A (M rows) and B (N rows) each hold K/8
// bytes per row; K must be a multiple of 8. For every (row, col) pair the
// differing bits are counted via popcount(xor) and the result written to C is
// K - 2 * mismatches. gemmTest() checks this against gemmNT on +/-1 inputs.
inline void qgemmNT(int M, int N, int K, const uint8_t* A, const uint8_t* B, float* C) {
  TORCH_CHECK_EQ(K % 8, 0);
  const int bytesPerRow = K / 8;
  for (int row = 0; row < M; ++row) {
    const uint8_t* lhs = A + row * bytesPerRow;
    for (int col = 0; col < N; ++col) {
      const uint8_t* rhs = B + col * bytesPerRow;
      float mismatches = 0.0f;
      for (int q = 0; q < bytesPerRow; ++q) {
        mismatches += __builtin_popcount(lhs[q] ^ rhs[q]);
      }
      C[row * N + col] = K - 2 * mismatches;
    }
  }
}
// Generates random +/-1 matrices X (M x K) and W (N x K), multiplies them once
// through the sign-quantized path (signQuantize + qgemmNT) and once through
// the float reference (gemmNT), and expects matching shapes and values.
void gemmTest(int64_t M, int64_t N, int64_t K) {
  auto X = genTensor11({M, K});
  auto W = genTensor11({N, K});
  Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);

  // Quantized path.
  signQuantize(X, &XQ);
  signQuantize(W, &WQ);
  YQ.Resize(M, N);
  qgemmNT(M, N, K, XQ.data<uint8_t>(), WQ.data<uint8_t>(), YQ.mutable_data<float>());

  // Float reference path.
  Y.Resize(M, N);
  gemmNT(M, N, K, X.data<float>(), W.data<float>(), Y.mutable_data<float>());

  EXPECT_TRUE(Y.sizes() == YQ.sizes());
  const float* expected = Y.data<float>();
  const float* actual = YQ.data<float>();
  for (int idx = 0; idx < Y.size(); ++idx) {
    EXPECT_NEAR(expected[idx], actual[idx], 1e-3);
  }
}
// Runs gemmTest over a fixed list of {M, N, K} problem sizes.
TEST(QConv, GemmTest) {
  const int64_t problemSizes[][3] = {
      {8, 64, 64},
      {16, 64, 256},
      {24, 128, 192},
      {32, 64, 64},
      {40, 64, 128},
      {64, 64, 256},
  };
  for (const auto& mnk : problemSizes) {
    gemmTest(mnk[0], mnk[1], mnk[2]);
  }
}
// Compares qconv() on sign-quantized +/-1 tensors with the float reference
// conv() for a 1x9x9x16 input and a 28x3x3x16 filter, using default ConvArgs
// and no bias.
TEST(QConv, ConvTest) {
  const int spatial = 9;
  const int inChannels = 16;
  const int outChannels = 28;
  const int kernel = 3;

  auto X = genTensor11({1, spatial, spatial, inChannels});
  auto W = genTensor11({outChannels, kernel, kernel, inChannels});
  Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU);

  // Quantized path.
  signQuantize(X, &XQ);
  signQuantize(W, &WQ);
  qconv(ConvArgs{}, XQ, WQ, nullptr, &YQ);

  // Float reference path.
  conv(ConvArgs{}, X, W, nullptr, &Y);

  EXPECT_TRUE(Y.sizes() == YQ.sizes());
  const float* expected = Y.data<float>();
  const float* actual = YQ.data<float>();
  for (int idx = 0; idx < Y.size(); ++idx) {
    EXPECT_NEAR(expected[idx], actual[idx], 1e-3);
  }
}
void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs args) {
args.stride_h = std::min(args.stride_h, KH);
args.stride_w = std::min(args.stride_w, KW);
args.pad_l = std::min(args.pad_l, KW - 1);
args.pad_r = std::min(args.pad_r, KW - 1);
args.pad_t = std::min(args.pad_t, KH - 1);
args.pad_b = std::min(args.pad_b, KH - 1);
LOG(INFO) << "IC: " << IC << ", KH: " << KH << ", KW: " << KW << ", H: " << H << ", W: " << W
<< ", OC: " << OC << ", N: " << N << ", pad_l: " << args.pad_l
<< ", pad_r: " << args.pad_r << ", pad_t: " << args.pad_t << ", pad_b: " << args.pad_b
<< ", stride_h: " << args.stride_h << ", stride_w: " << args.stride_w;
auto X = genTensor0123({N, H, W, IC});
auto W_ = genTensor11({OC, KH, KW, IC});
auto bias = genTensorUniform11({OC});
Tensor Y(CPU), YQ(CPU), Y2b1b(CPU), YOP(CPU);
{
std::vector<std::unique_ptr<TensorCPU>> XQs(k2b1bXBits);
std::vector<std::unique_ptr<TensorCPU>> YQs(k2b1bXBits);
for (auto i = 0; i < k2b1bXBits; ++i) {
XQs[i] = std::make_unique<Tensor>(CPU);
YQs[i] = std::make_unique<Tensor>(CPU);
}
Tensor WQN(CPU), WQ(CPU);
uniformQuantize2b1b(X, XQs, 0.5, 1.0);
signQuantize(W_, &WQ);
filterNormalization11(WQ, &WQN);
for (auto i = 0; i < XQs.size(); ++i) {
qconv(args, *(XQs[i]), WQ, nullptr, YQs[i].get());
}
YQ.ResizeLike(*YQs[0]);
const auto F = WQ.dim(0);
const auto N = YQ.size() / F;
run2b1bUnification(nullptr,
N,
F,
WQN.data<float>(),
YQs[0]->data<float>(),
YQs[1]->data<float>(),
F,
YQ.mutable_data<float>(),
F,
bias.data<float>());
}
{
Workspace ws;
auto state = create2b1bConvState(&ws, W_, &bias);
run2b1bConvGeneric(state.get(), args, X, &Y2b1b);
}
{
Workspace ws;
OperatorDef def;
def.set_type("QConv");
def.add_input("X");
def.add_input("W");
def.add_input("b");
def.add_output("Y");
def.add_arg()->CopyFrom(MakeArgument("kernel_h", KH));
def.add_arg()->CopyFrom(MakeArgument("order", std::string("NHWC")));
def.add_arg()->CopyFrom(MakeArgument("kernel_w", KW));
def.add_arg()->CopyFrom(MakeArgument("stride_h", args.stride_h));
def.add_arg()->CopyFrom(MakeArgument("stride_w", args.stride_w));
def.add_arg()->CopyFrom(MakeArgument("pad_l", args.pad_l));
def.add_arg()->CopyFrom(MakeArgument("pad_r", args.pad_r));
def.add_arg()->CopyFrom(MakeArgument("pad_t", args.pad_t));
def.add_arg()->CopyFrom(MakeArgument("pad_b", args.pad_b));
auto* Xws = BlobGetMutableTensor(ws.CreateBlob("X"), CPU);
Xws->ResizeLike(X);
Xws->ShareExternalPointer(X.mutable_data<float>(), X.size());
auto* Wws = BlobGetMutableTensor(ws.CreateBlob("W"), CPU);
Wws->ResizeLike(W_);
Wws->ShareExternalPointer(W_.mutable_data<float>(), W_.size());
auto* bws = BlobGetMutableTensor(ws.CreateBlob("b"), CPU);
bws->ResizeLike(bias);
bws->ShareExternalPointer(bias.mutable_data<float>(), bias.size());
ws.RunOperatorOnce(def);
YOP.CopyFrom(ws.GetBlob("Y")->Get<TensorCPU>());
}
{ conv(args, X, W_, &bias, &Y); }
EXPECT_TRUE(Y.sizes() == YQ.sizes());
EXPECT_TRUE(Y.sizes() == Y2b1b.sizes());
EXPECT_TRUE(Y.sizes() == YOP.sizes());
// for (auto i = 0; i < Y.size(); ++i) {
// LOG(INFO) << "i: " << i << ", y[i]: " << Y.data<float>()[i]
// << ", y2b1b[i]: " << Y2b1b.data<float>()[i] << ", yq[i]: " << YQ.data<float>()[i];
// }
for (auto i = 0; i < Y.size(); ++i) {
EXPECT_NEAR(Y.data<float>()[i], YQ.data<float>()[i], 1e-3);
}
for (auto i = 0; i < Y.size(); ++i) {
EXPECT_NEAR(Y.data<float>()[i], Y2b1b.data<float>()[i], 1e-3);
}
for (auto i = 0; i < Y.size(); ++i) {
EXPECT_NEAR(Y.data<float>()[i], YOP.data<float>()[i], 1e-3);
}
}
// Builds a ConvArgs with the same padding on all four sides and the same
// stride along both axes. Defaults: no padding, stride 1.
ConvArgs ca(size_t pad = 0, size_t stride = 1) {
  ConvArgs result;
  result.pad_l = result.pad_r = result.pad_t = result.pad_b = pad;
  result.stride_w = result.stride_h = stride;
  return result;
}
// Runs ConvTest2b1b over a fixed list of configurations. Every case uses the
// default ca() arguments (pad 0, stride 1) except the 5x5 one, which uses
// pad 1 and stride 2.
TEST(QConv, 2b1bConvTest) {
  struct Case {
    int IC;
    int KH;
    int KW;
    int H;
    int W;
    int OC;
    int N;
    int pad;
    int stride;
  };
  const Case cases[] = {
      {40, 3, 4, 10, 10, 32, 1, 0, 1},
      {59, 1, 1, 1, 1, 1, 1, 0, 1},
      {59, 2, 2, 3, 3, 1, 1, 0, 1},
      {59, 2, 2, 3, 3, 64, 1, 0, 1},
      {64, 1, 1, 1, 1, 1, 1, 0, 1},
      {64, 1, 1, 1, 1, 64, 1, 0, 1},
      {64, 2, 2, 3, 3, 1, 1, 0, 1},
      {64, 1, 1, 3, 3, 1, 1, 0, 1},
      {128, 1, 1, 1, 1, 128, 1, 0, 1},
      {128, 1, 1, 8, 8, 8, 1, 0, 1},
      {128, 3, 3, 25, 100, 16, 1, 0, 1},
      {64, 3, 3, 10, 10, 8, 1, 0, 1},
      {128, 1, 3, 10, 10, 16, 1, 0, 1},
      {256, 3, 3, 14, 17, 128, 1, 0, 1},
      {512, 3, 3, 3, 3, 3, 1, 0, 1},
      {64, 5, 5, 14, 17, 15, 1, 1, 2},
      {64, 1, 3, 14, 17, 32, 1, 0, 1},
      {64, 2, 1, 14, 17, 7, 1, 0, 1},
      {128, 1, 1, 14, 17, 128, 1, 0, 1},
      {128, 1, 1, 14, 17, 32, 1, 0, 1},
  };
  for (const Case& c : cases) {
    ConvTest2b1b(c.IC, c.KH, c.KW, c.H, c.W, c.OC, c.N, ca(c.pad, c.stride));
  }
}
// Runs ConvTest2b1b on a few small configurations, mostly with very few input
// channels, all with default ca() arguments.
// Columns: IC, KH, KW, H, W, OC, N.
TEST(QConv, 2b1bInputPackingTest) {
  const int cases[][7] = {
      {64, 1, 1, 1, 1, 128, 1},
      {8, 1, 1, 1, 1, 1, 1},
      {2, 1, 1, 1, 1, 1, 1},
      {2, 1, 1, 3, 3, 1, 1},
      {2, 2, 2, 3, 3, 1, 1},
  };
  for (const auto& c : cases) {
    ConvTest2b1b(c[0], c[1], c[2], c[3], c[4], c[5], c[6], ca());
  }
}
// Ten rounds of ConvTest2b1b with randomly drawn shapes, each round covering
// eight configuration families (see the per-call comments).
TEST(QConv, 2b1bConvTestRandomized) {
  // Random padding in [0, 3] per side and random stride in [1, 3] per axis.
  // ConvTest2b1b clamps both to the kernel extent.
  const auto randomArgs = []() {
    ConvArgs r;
    r.pad_l = randInt(0, 3);
    r.pad_r = randInt(0, 3);
    r.pad_t = randInt(0, 3);
    r.pad_b = randInt(0, 3);
    r.stride_w = randInt(1, 3);
    r.stride_h = randInt(1, 3);
    return r;
  };
  // Input channels as a multiple of 8, in [8, 512].
  const auto channels8 = []() { return randInt(1, 64) * 8; };
  // Kernel extent in [1, 4].
  const auto kernel = []() { return randInt(1, 4); };
  // Spatial extent (H or W) in [5, 12].
  const auto extent = []() { return randInt(5, 12); };
  // Batch size, 1 or 2.
  const auto batch = []() { return randInt(1, 2); };

  for (int round = 0; round < 10; ++round) {
    // Arbitrary kernel, channels a multiple of 8.
    ConvTest2b1b(
        channels8(), kernel(), kernel(), extent(), extent(), randInt(1, 64), batch(), randomArgs());
    // Test 3x3 path.
    ConvTest2b1b(channels8(), 3, 3, extent(), extent(), randInt(1, 64), batch(), randomArgs());
    // Test 3x3s2 path. NOTE(review): the stride still comes from the random
    // args, so stride 2 is not guaranteed here.
    ConvTest2b1b(channels8(), 3, 3, extent(), extent(), randInt(1, 64), batch(), randomArgs());
    // Test 3x3 path with packing (output channels a multiple of kGEMMTileSize).
    ConvTest2b1b(channels8(),
                 3,
                 3,
                 extent(),
                 extent(),
                 randInt(1, 8) * kGEMMTileSize,
                 batch(),
                 randomArgs());
    // Test 1x1 path (default args: no padding, stride 1).
    ConvTest2b1b(channels8(), 1, 1, extent(), extent(), randInt(1, 64), batch(), ca());
    // Test 1x1 with direct packing (output channels a multiple of kGEMMTileSize).
    ConvTest2b1b(
        channels8(), 1, 1, extent(), extent(), randInt(1, 4) * kGEMMTileSize, batch(), ca());
    // Entirely arbitrary, no padding codepath. NOTE(review): the spatial
    // padding args are still random; "padding" presumably refers to the input
    // channels being a multiple of 8 - confirm.
    ConvTest2b1b(channels8(),
                 kernel(),
                 kernel(),
                 extent(),
                 extent(),
                 randInt(1, 128),
                 batch(),
                 randomArgs());
    // Entirely arbitrary, mixed codepath (input channels not forced to a
    // multiple of 8).
    ConvTest2b1b(randInt(1, 64),
                 kernel(),
                 kernel(),
                 extent(),
                 extent(),
                 randInt(1, 128),
                 batch(),
                 randomArgs());
  }
}
} // namespace caffe2