Fix incorrect vector length (VLEN) in pool_dnnlowp_op_avx2.cc (#18141)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18141

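VLEN should have been 32, not 8: the vectorized loop operates on 256-bit
`__m256i` registers over `uint8_t` data, so each register holds 32 elements.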

Reviewed By: jianyuh

Differential Revision: D14510780

fbshipit-source-id: ddf12746e1c69677a268432432ddb088cc210084
diff --git a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc b/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc
index 92d0816..8c35ebd 100644
--- a/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc
+++ b/caffe2/quantization/server/pool_dnnlowp_op_avx2.cc
@@ -34,7 +34,7 @@
       wstart = wstart > 0 ? wstart : 0;
 
       uint8_t* Yh = Ydata_temp + (ph * pooled_width + pw) * channels;
-      constexpr int VLEN = 8;
+      constexpr int VLEN = 32;
       // vectorized loop
       for (int c = 0; c < channels / VLEN * VLEN; c += VLEN) {
         __m256i Y_v = _mm256_setzero_si256();