| #ifndef CAFFE2_OPERATORS_TT_LINEAR_OP_H_ | 
 | #define CAFFE2_OPERATORS_TT_LINEAR_OP_H_ | 
 |  | 
 | #ifdef CAFFE2_USE_MKL | 
 | #include <mkl.h> | 
 | #endif // CAFFE2_USE_MKL | 
 |  | 
 | #include "Eigen/Core" | 
 | #include "Eigen/Dense" | 
 | #include "caffe2/core/context.h" | 
 | #include "caffe2/core/operator.h" | 
 | #include "caffe2/utils/eigen_utils.h" | 
 | #include "caffe2/utils/math.h" | 
 |  | 
 | namespace caffe2 { | 
 |  | 
 | template <typename T, class Context, class Engine = DefaultEngine> | 
 | class TTLinearOp final : public Operator<Context> { | 
 |  public: | 
 |   USE_OPERATOR_CONTEXT_FUNCTIONS; | 
 |   template <class... Args> | 
 |   explicit TTLinearOp(Args&&... args) | 
 |       : Operator<Context>(std::forward<Args>(args)...), | 
 |         inp_sizes_(this->template GetRepeatedArgument<int>("inp_sizes")), | 
 |         out_sizes_(this->template GetRepeatedArgument<int>("out_sizes")), | 
 |         tt_ranks_(this->template GetRepeatedArgument<int>("tt_ranks")), | 
 |         Y_temp_(unique_ptr<Blob>(new Blob())) {} | 
 |   ~TTLinearOp() {} | 
 |  | 
 |   bool RunOnDevice() override { | 
 |     const auto& X = Input(0); // Input array | 
 |     const auto& b = Input(1); // Bias array | 
 |     const auto& cores = Input(2); // 1D array containing the TT-cores | 
 |  | 
 |     CAFFE_ENFORCE(X.dim() > 1, "Number of dimensions in X: ", X.dim()); | 
 |     CAFFE_ENFORCE(b.dim() == 1, "Number of dimensions in b: ", b.dim()); | 
 |     CAFFE_ENFORCE( | 
 |         inp_sizes_.size() == out_sizes_.size(), | 
 |         "inp_sizes has size: ", | 
 |         inp_sizes_.size(), | 
 |         ", out_sizes has size: ", | 
 |         out_sizes_.size()); | 
 |     CAFFE_ENFORCE( | 
 |         cores.dim() == 1, "Number of dimensions in cores: ", cores.dim()); | 
 |     // batch size | 
 |     const int batch_size = X.dim() > 1 ? X.dim32(0) : 1; | 
 |  | 
 |     // dimension d of tensors | 
 |     const int d = inp_sizes_.size(); | 
 |  | 
 |     // Keep track of index of current core in multiplication | 
 |     int cores_idx = 0; | 
 |  | 
 |     // Temporary buffer to facilitate multiplication of TT-cores with input | 
 |     auto Y_buf = BlobGetMutableTensor(Y_temp_.get(), Context::GetDeviceType()); | 
 |     Y_buf->ResizeLike(X); | 
 |     Y_buf->CopyFrom(X); | 
 |     Tensor* Y; | 
 |  | 
 |     // The overall forward pass involves multiplication with each core, where | 
 |     // each core has sizes dictated by inp_sizes_ and out_sizes_. Each core thus | 
 |     // has size inp_sizes_[i] * tt_ranks_[i] * tt_ranks_[i + 1] * out_sizes_[i]. | 
 |     for (int i = (d - 1); i >= 0; --i) { | 
 |       int curr_rows = inp_sizes_[i] * tt_ranks_[i + 1]; | 
 |       int curr_cols = tt_ranks_[i] * out_sizes_[i]; | 
 |  | 
 |       // TODO Replace by Reshape(), once wrappers are written | 
 |       Y_buf->Resize(Y_buf->numel() / curr_rows, curr_rows); | 
 |       Y = Output( | 
 |           0, {Y_buf->numel() / curr_rows, curr_cols}, at::dtype<float>()); | 
 |  | 
 |       // Defensive checks | 
 |       CAFFE_ENFORCE(Y_buf->numel() % curr_rows == 0, Y_buf->numel(), curr_rows); | 
 |       CAFFE_ENFORCE( | 
 |           cores_idx + curr_rows * curr_cols <= cores.numel(), | 
 |           cores_idx + curr_rows * curr_cols, | 
 |           cores.numel()); | 
 |  | 
 |       // Multiply ith core with the intermediate output | 
 |       math::Gemm<float, Context, Engine>( | 
 |           CblasNoTrans, | 
 |           CblasNoTrans, | 
 |           Y_buf->numel() / curr_rows, | 
 |           curr_cols, | 
 |           curr_rows, | 
 |           1, | 
 |           Y_buf->template data<float>(), | 
 |           cores.template data<float>() + cores_idx, | 
 |           0, | 
 |           Y->template mutable_data<float>(), | 
 |           &context_); | 
 |  | 
 |       CAFFE_ENFORCE(Y->numel() % out_sizes_[i] == 0, Y->numel(), out_sizes_[i]); | 
 |  | 
 |       // TODO Add GPU support by writing a generic wrapper. | 
 |       auto Y_mat = EigenMatrixMap<float>( | 
 |           Y->template mutable_data<float>(), | 
 |           Y->numel() / out_sizes_[i], | 
 |           out_sizes_[i]); | 
 |       Y_mat = ConstEigenMatrixMap<float>( | 
 |                   Y->template data<float>(), | 
 |                   out_sizes_[i], | 
 |                   Y->numel() / out_sizes_[i]) | 
 |                   .transpose() | 
 |                   .eval(); | 
 |  | 
 |       // Resize operation | 
 |       Y_buf->Resize(Y->dim32(0), Y->dim32(1)); | 
 |       context_.template CopyFromCPU<float>( | 
 |           Y->numel(), | 
 |           Y->template data<float>(), | 
 |           Y_buf->template mutable_data<float>()); | 
 |  | 
 |       cores_idx += curr_rows * curr_cols; | 
 |     } | 
 |  | 
 |     // TODO Add GPU support by writing a generic wrapper. | 
 |     auto Y_mat = EigenMatrixMap<float>( | 
 |         Y->template mutable_data<float>(), batch_size, Y->numel() / batch_size); | 
 |     Y_mat = ConstEigenMatrixMap<float>( | 
 |                 Y->template data<float>(), Y->numel() / batch_size, batch_size) | 
 |                 .transpose() | 
 |                 .eval(); | 
 |     // TODO Replace by Reshape(), once wrappers are written | 
 |     Y = Output(0, {batch_size, Y->numel() / batch_size}, at::dtype<float>()); | 
 |  | 
 |     // Check that output size of Y is the element-wise product of out_sizes | 
 |     int prod_out_sizes = 1; | 
 |     // NOLINTNEXTLINE(clang-diagnostic-sign-compare) | 
 |     for (int i = 0; i < out_sizes_.size(); i++) { | 
 |       prod_out_sizes *= out_sizes_[i]; | 
 |     } | 
 |     CAFFE_ENFORCE( | 
 |         Y->dim32(1) == prod_out_sizes, | 
 |         "Output dimension of Y: ", | 
 |         Y->dim32(1), | 
 |         ", product of out_sizes: ", | 
 |         prod_out_sizes); | 
 |  | 
 |     // Add bias term | 
 |     if (bias_multiplier_.numel() != batch_size) { | 
 |       // If the helper bias multiplier is not M, reshape and fill it with one. | 
 |       ReinitializeTensor( | 
 |           &bias_multiplier_, | 
 |           {batch_size}, | 
 |           at::dtype<T>().device(Context::GetDeviceType())); | 
 |       math::Set<T, Context>( | 
 |           batch_size, | 
 |           static_cast<T>(1), | 
 |           bias_multiplier_.template mutable_data<T>(), | 
 |           &context_); | 
 |     } | 
 |     math::Gemm<T, Context, Engine>( | 
 |         CblasNoTrans, | 
 |         CblasNoTrans, | 
 |         Y->dim32(0), | 
 |         Y->dim32(1), | 
 |         1, | 
 |         1, | 
 |         bias_multiplier_.template data<T>(), | 
 |         b.template data<T>(), | 
 |         1, | 
 |         Y->template mutable_data<T>(), | 
 |         &context_); | 
 |     return true; | 
 |   } | 
 |  | 
 |  protected: | 
 |   Tensor bias_multiplier_; | 
 |   std::vector<int> inp_sizes_; | 
 |   std::vector<int> out_sizes_; | 
 |   std::vector<int> tt_ranks_; | 
 |   std::unique_ptr<Blob> Y_temp_; | 
 | }; | 
 |  | 
 | // TODO: Complete after verifying utility of TT-layer's forward pass. | 
 | template <typename T, class Context, class Engine = DefaultEngine> | 
 | class TTLinearGradientOp : public Operator<Context> { | 
 |  public: | 
 |   USE_OPERATOR_CONTEXT_FUNCTIONS; | 
 |   template <class... Args> | 
 |   explicit TTLinearGradientOp(Args&&... args) | 
 |       : Operator<Context>(std::forward<Args>(args)...) {} | 
 |   ~TTLinearGradientOp() {} | 
 |  | 
 |   bool RunOnDevice() override { | 
 |     return false; | 
 |   } | 
 |  | 
 |  protected: | 
 |   Tensor bias_multiplier_{Context::GetDeviceType()}; | 
 | }; | 
 |  | 
 | } // namespace caffe2 | 
 |  | 
 | #endif // CAFFE2_OPERATORS_TT_LINEAR_OP_H_ |