Adding TensorIndexTuple and TensorTupleReducerOp backends (ArgMax/ArgMin) for SYCL; fixing the address-space issue for const TensorMap; converting all discard_write accessors to write due to a data mismatch.
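
Illustrative usage (a minimal sketch, not part of this patch; the chosen device, tensor names
and sizes are placeholders): with this backend, an argmax reduction can be evaluated on a SYCL
device roughly as follows.

    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Wrap one of the devices reported by get_sycl_supported_devices() in a queue interface.
      auto devices = Eigen::get_sycl_supported_devices();
      Eigen::QueueInterface queue_interface(devices[0]);
      Eigen::SyclDevice sycl_device(&queue_interface);

      Eigen::Tensor<float, 2> in(4, 7);            // host input
      Eigen::Tensor<Eigen::DenseIndex, 1> out(7);  // host result of argmax along dim 0
      in.setRandom();

      // Device-side buffers, addressed through the host-pointer keys returned by allocate().
      float* d_in = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
      Eigen::DenseIndex* d_out = static_cast<Eigen::DenseIndex*>(
          sycl_device.allocate(out.size() * sizeof(Eigen::DenseIndex)));
      Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_in(d_in, 4, 7);
      Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 1> > gpu_out(d_out, 7);

      sycl_device.memcpyHostToDevice(d_in, in.data(), in.size() * sizeof(float));
      // Runs through the TensorIndexTupleOp / TensorTupleReducerOp SYCL path added by this patch.
      gpu_out.device(sycl_device) = gpu_in.argmax(0);
      sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(Eigen::DenseIndex));

      sycl_device.deallocate(d_in);
      sycl_device.deallocate(d_out);
      return 0;
    }
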
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index d06f40c..e81001c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -119,6 +119,12 @@
 
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
+  // required by sycl
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
+    return m_impl;
+  }
+
+
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
 };
@@ -222,7 +228,7 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_orig_impl(op.expression(), device),
         m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
-        m_return_dim(op.return_dim()) {
+        m_return_dim(op.return_dim()), m_device(device) {
 
     gen_strides(m_orig_impl.dimensions(), m_strides);
     if (Layout == static_cast<int>(ColMajor)) {
@@ -252,7 +258,16 @@
     return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
   }
 
+  #ifndef EIGEN_USE_SYCL
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  #else // following functions are required by sycl
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}
+  const Device& device() const{return m_device;}
+  #endif
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
@@ -292,6 +307,8 @@
   StrideDims m_strides;
   Index m_stride_mod;
   Index m_stride_div;
+ // required by sycl
+  const Device& m_device;
 };
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
new file mode 100644
index 0000000..90cbe00
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
@@ -0,0 +1,146 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ *  TensorArgMaxSycl.h
+ * \brief:
+ *  TensorArgMaxSycl
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
+namespace Eigen {
+namespace internal {
+  template<typename Dims, typename XprType>
+  struct eval<TensorTupleReducerDeviceOp<Dims, XprType>, Eigen::Dense>
+  {
+    typedef const TensorTupleReducerDeviceOp<Dims, XprType>& type;
+  };
+
+  template<typename Dims, typename XprType>
+  struct nested<TensorTupleReducerDeviceOp<Dims, XprType>, 1,
+                typename eval<TensorTupleReducerDeviceOp<Dims, XprType> >::type>
+  {
+    typedef TensorTupleReducerDeviceOp<Dims, XprType> type;
+  };
+
+template<typename StrideDims, typename XprType>
+struct traits<TensorTupleReducerDeviceOp<StrideDims, XprType> > : public traits<XprType>
+{
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+};
+
+
+}// end namespace internal
+template<typename StrideDims, typename XprType>
+class TensorTupleReducerDeviceOp : public TensorBase<TensorTupleReducerDeviceOp<StrideDims, XprType>, ReadOnlyAccessors>
+{
+  public:
+  typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::nested<TensorTupleReducerDeviceOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorTupleReducerDeviceOp>::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr,
+                                                              const int return_dim,
+                                                              const StrideDims& strides,
+                                                              const Index& stride_mod, const Index& stride_div)
+          :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {}
+
+  EIGEN_DEVICE_FUNC
+  const typename internal::remove_all<typename XprType::Nested>::type&
+  expression() const { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC
+  int return_dim() const { return m_return_dim; }
+
+  EIGEN_DEVICE_FUNC
+  const StrideDims& strides() const { return m_strides; }
+
+  EIGEN_DEVICE_FUNC
+  const Index& stride_mod() const { return m_stride_mod; }
+
+  EIGEN_DEVICE_FUNC
+  const Index& stride_div() const { return m_stride_div; }
+
+  protected:
+    typename Eigen::internal::remove_all<typename
+    XprType::Nested
+    >::type m_xpr;
+    const int m_return_dim;
+    const StrideDims& m_strides;
+    const Index m_stride_mod;
+    const Index m_stride_div;
+};
+
+
+// Eval as rvalue
+template<typename StrideDims, typename ArgType>
+struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>
+{
+  typedef TensorTupleReducerDeviceOp<StrideDims, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Index Scalar;
+  typedef Index CoeffReturnType;
+  typedef typename XprType::CoeffReturnType TupleType;
+  typedef typename TensorEvaluator<ArgType, SyclKernelDevice>::Dimensions Dimensions;
+
+  enum {
+    IsAligned =  false,
+    PacketAccess = false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,  const SyclKernelDevice& device)
+      : m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()),
+      m_stride_div(op.stride_div()){}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    const TupleType v = m_impl.coeff(index);
+    return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
+  }
+typedef typename MakeGlobalPointer<typename TensorEvaluator<ArgType , SyclKernelDevice>::CoeffReturnType >::Type ptr_Dev_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type  data() const { return const_cast<ptr_Dev_type>(m_impl.data()); }
+
+protected:
+ TensorEvaluator<ArgType , SyclKernelDevice> m_impl;
+ const int m_return_dim;
+ const StrideDims& m_strides;
+ const Index& m_stride_mod;
+ const Index& m_stride_div;
+};
+} // end namespace Eigen
+#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index fcd7d4d..5b4c3c5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -11,7 +11,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /*****************************************************************
- * TensorSyclConvertToDeviceExpression.h
+ * TensorContractionSycl.h
  *
  * \brief:
  *  TensorContractionsycl
@@ -389,9 +389,9 @@
       cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
        KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
        RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
-       WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
+       WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::SyclKernelDevice>(lhs_functors, rhs_functors,
           localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
-          m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
+          m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice()));
     });
     self.device().asynchronousExec();
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index 66ffd81..5db16d5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -45,7 +45,7 @@
   void operator()(cl::sycl::nd_item<2> itemID) {
     typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
     auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
 
     auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
     auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -103,7 +103,7 @@
   void operator()(cl::sycl::nd_item<3> itemID) {
     typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
     auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
 
     auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
     auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -173,7 +173,7 @@
   void operator()(cl::sycl::nd_item<3> itemID) {
     typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
     auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+    auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
 
     auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
     auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
@@ -339,8 +339,8 @@
       // create input tuple of accessors
       InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
 
-      typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType;
-      OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data);
+      typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutputAccessorType;
+      OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, data);
       typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
       KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 964222a..2582184 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -41,9 +41,8 @@
     size_t m_i;
     size_t m_offset;
   };
-
+template<typename AccType>
   struct memsetkernelFunctor{
-   typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
    AccType m_acc;
    const ptrdiff_t buff_offset;
    const size_t m_rng, m_c;
@@ -55,15 +54,19 @@
 
   };
 
+  // get_devices returns all the available OpenCL devices. Either use a device_selector or exclude devices that ComputeCpp does not support (AMD OpenCL for CPU and Intel GPU).
 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
   auto devices = cl::sycl::device::get_devices();
   std::vector<cl::sycl::device>::iterator it =devices.begin();
   while(it!=devices.end()) {
-    /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
+    /// FIXME: currently there is a bug in the AMD CPU OpenCL driver
     auto s=  (*it).template get_info<cl::sycl::info::device::vendor>();
     std::transform(s.begin(), s.end(), s.begin(), ::tolower);
     if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs
       it=devices.erase(it);
+      // FIXME: currently there is a bug in the Intel GPU driver regarding a memory alignment issue.
+    }else if((*it).is_gpu() && s.find("intel")!=std::string::npos){
+      it=devices.erase(it);
     }
     else{
       ++it;
@@ -112,6 +115,154 @@
 }))
 #endif
   {}
+// FIXME: currently we have to switch back to write, as discard_write does not work inside a for loop.
+template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
+      std::lock_guard<std::mutex> lock(mutex_);
+      auto host_acc= find_buffer(dst)->second. template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::host_buffer>();
+      ::memcpy(host_acc.get_pointer(), src, n);
+}
+
+template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  // Assuming that the dst is the start of the destination pointer
+  auto it = find_buffer(src);
+  auto offset = static_cast<const uint8_t*>(static_cast<const void*>(src)) - it->first;
+  offset /= sizeof(Index);
+  size_t rng, GRange, tileSize;
+  parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
+  auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
+  m_queue.submit([&](cl::sycl::handler &cgh) {
+    auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
+    auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
+    typedef decltype(src_acc) read_accessor;
+    typedef decltype(dst_acc) write_accessor;
+    cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
+  });
+  m_queue.wait_and_throw(); // calling synchronize() here would re-lock mutex_, which this function already holds
+
+}
+
+EIGEN_STRONG_INLINE void synchronize() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  m_queue.wait_and_throw(); //pass
+}
+EIGEN_STRONG_INLINE void asynchronousExec() const {
+  /// FIXME: currently there is a race condition regarding the async scheduler.
+  //m_queue.throw_asynchronous(); // does not pass. Temporarily disabled
+  std::lock_guard<std::mutex> lock(mutex_);
+  m_queue.wait_and_throw(); //pass
+
+}
+
+template<typename Index>
+EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange)  const {
+  tileSize =static_cast<Index>(m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
+  auto s=  m_queue.get_device().template get_info<cl::sycl::info::device::vendor>();
+  std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+  if(m_queue.get_device().is_cpu()){ // Intel does not allow using the max workgroup size
+    tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
+  }
+  rng = n;
+  if (rng==0) rng=static_cast<Index>(1);
+  GRange=rng;
+  if (tileSize>GRange) tileSize=GRange;
+  else if(GRange>tileSize){
+    Index xMode =  static_cast<Index>(GRange % tileSize);
+    if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
+  }
+}
+
+/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+template<typename Index>
+EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1)  const {
+  Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+  if(m_queue.get_device().is_cpu()){ // Intel does not allow using the max workgroup size
+    max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+  }
+  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+  tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+  rng1=dim1;
+  if (rng1==0 ) rng1=static_cast<Index>(1);
+  GRange1=rng1;
+  if (tileSize1>GRange1) tileSize1=GRange1;
+  else if(GRange1>tileSize1){
+    Index xMode =  static_cast<Index>(GRange1 % tileSize1);
+    if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+  }
+  tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
+  rng0 = dim0;
+  if (rng0==0 ) rng0=static_cast<Index>(1);
+  GRange0=rng0;
+  if (tileSize0>GRange0) tileSize0=GRange0;
+  else if(GRange0>tileSize0){
+    Index xMode =  static_cast<Index>(GRange0 % tileSize0);
+    if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+  }
+}
+
+
+
+/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+template<typename Index>
+EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2)  const {
+  Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+  if(m_queue.get_device().is_cpu()){ // Intel does not allow using the max workgroup size
+    max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+  }
+  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+  tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
+  rng2=dim2;
+  if (rng2==0 ) rng2=static_cast<Index>(1);
+  GRange2=rng2;
+  if (tileSize2>GRange2) tileSize2=GRange2;
+  else if(GRange2>tileSize2){
+    Index xMode =  static_cast<Index>(GRange2 % tileSize2);
+    if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+  }
+  pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
+  tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+  rng1=dim1;
+  if (rng1==0 ) rng1=static_cast<Index>(1);
+  GRange1=rng1;
+  if (tileSize1>GRange1) tileSize1=GRange1;
+  else if(GRange1>tileSize1){
+    Index xMode =  static_cast<Index>(GRange1 % tileSize1);
+    if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+  }
+  tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
+  rng0 = dim0;
+  if (rng0==0 ) rng0=static_cast<Index>(1);
+  GRange0=rng0;
+  if (tileSize0>GRange0) tileSize0=GRange0;
+  else if(GRange0>tileSize0){
+    Index xMode =  static_cast<Index>(GRange0 % tileSize0);
+    if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+  }
+}
+
+
+EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return m_queue.get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
+//  return stream_->deviceProperties().multiProcessorCount;
+}
+EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
+
+//  return stream_->deviceProperties().maxThreadsPerBlock;
+}
+EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  // OpenCL does not have such a concept
+  return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
+//  return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+}
+EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return m_queue.get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
+//  return stream_->deviceProperties().sharedMemPerBlock;
+}
 
   /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer.
   /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key
@@ -119,10 +270,10 @@
   /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer.
   /// The device pointer would be deleted by calling deallocate function.
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    std::lock_guard<std::mutex> lock(mutex_);
     auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes));
     auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
     buf.set_final_data(nullptr);
-    std::lock_guard<std::mutex> lock(mutex_);
     buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf));
     return static_cast<void*>(ptr);
   }
@@ -193,48 +344,13 @@
   /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
   template<typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange)  const {
-    tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
-    auto s=  sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
-    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-      tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
-    }
-    rng = n;
-    if (rng==0) rng=static_cast<Index>(1);
-    GRange=rng;
-    if (tileSize>GRange) tileSize=GRange;
-    else if(GRange>tileSize){
-      Index xMode =  static_cast<Index>(GRange % tileSize);
-      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
-    }
+  m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange);
   }
 
   /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
   template<typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1)  const {
-    Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
-    if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-      max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
-    }
-    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-    tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
-    rng1=dim1;
-    if (rng1==0 ) rng1=static_cast<Index>(1);
-    GRange1=rng1;
-    if (tileSize1>GRange1) tileSize1=GRange1;
-    else if(GRange1>tileSize1){
-      Index xMode =  static_cast<Index>(GRange1 % tileSize1);
-      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
-    }
-    tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
-    rng0 = dim0;
-    if (rng0==0 ) rng0=static_cast<Index>(1);
-    GRange0=rng0;
-    if (tileSize0>GRange0) tileSize0=GRange0;
-    else if(GRange0>tileSize0){
-      Index xMode =  static_cast<Index>(GRange0 % tileSize0);
-      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
-    }
+    m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1);
   }
 
 
@@ -242,39 +358,8 @@
   /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
   template<typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2)  const {
-    Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
-    if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
-      max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
-    }
-    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-    tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
-    rng2=dim2;
-    if (rng2==0 ) rng1=static_cast<Index>(1);
-    GRange2=rng2;
-    if (tileSize2>GRange2) tileSize2=GRange2;
-    else if(GRange2>tileSize2){
-      Index xMode =  static_cast<Index>(GRange2 % tileSize2);
-      if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
-    }
-    pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
-    tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
-    rng1=dim1;
-    if (rng1==0 ) rng1=static_cast<Index>(1);
-    GRange1=rng1;
-    if (tileSize1>GRange1) tileSize1=GRange1;
-    else if(GRange1>tileSize1){
-      Index xMode =  static_cast<Index>(GRange1 % tileSize1);
-      if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
-    }
-    tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
-    rng0 = dim0;
-    if (rng0==0 ) rng0=static_cast<Index>(1);
-    GRange0=rng0;
-    if (tileSize0>GRange0) tileSize0=GRange0;
-    else if(GRange0>tileSize0){
-      Index xMode =  static_cast<Index>(GRange0 % tileSize0);
-      if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
-    }
+    m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2);
+
   }
   /// allocate device memory
   EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
@@ -319,8 +404,7 @@
   /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
   /// this buffer is accessed, the data will be copied to the device.
   template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
-    auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
-    ::memcpy(host_acc.get_pointer(), src, n);
+     m_queue_stream->memcpyHostToDevice(dst,src,n);
   }
   /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
   /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
@@ -329,21 +413,7 @@
   /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
   /// to the cpu only once per function call.
   template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
-    auto it = m_queue_stream->find_buffer(src);
-    auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
-    offset/=sizeof(Index);
-    size_t rng, GRange, tileSize;
-    parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
-    // Assuming that the dst is the start of the destination pointer
-    auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
-    sycl_queue().submit([&](cl::sycl::handler &cgh) {
-      auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
-      auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-      typedef decltype(src_acc) read_accessor;
-      typedef decltype(dst_acc) write_accessor;
-      cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
-    });
-    synchronize();
+    m_queue_stream->memcpyDeviceToHost(dst,src,n);
   }
   /// returning the sycl queue
   EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
@@ -366,8 +436,9 @@
     :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
 
     void operator()(cl::sycl::handler &cgh) const {
-      auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c));
+      auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
+      typedef decltype(buf_acc) AccType;
+      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor<AccType>(buf_acc, buff_offset, rng, c));
     }
   };
 
@@ -403,14 +474,13 @@
   EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
 
   EIGEN_STRONG_INLINE void synchronize() const {
-    sycl_queue().wait_and_throw(); //pass
+    m_queue_stream->synchronize(); //pass
   }
 
   EIGEN_STRONG_INLINE void asynchronousExec() const {
     ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
     //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
-    sycl_queue().wait_and_throw(); //pass
-
+    m_queue_stream->asynchronousExec();
   }
   // This function checks if the runtime recorded an error for the
   // underlying stream device.
@@ -418,8 +488,10 @@
     return m_queue_stream->ok();
   }
 };
-
-
+// This is used as a distinguishable device inside the kernel, as the sycl device class is not standard layout.
+// This is internal and must not be used by users. This dummy device allows us to specialise the tensor evaluator
+// inside the kernel, so we can have two kinds of eval, one for the host and one for the device. This is required for the TensorArgMax operation.
+struct SyclKernelDevice:DefaultDevice{};
 
 }  // end namespace Eigen
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index d641581..8516b37 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -193,7 +193,12 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     eigen_assert(m_data);
+#ifndef __SYCL_DEVICE_ONLY__
     return loadConstant(m_data+index);
+#else
+    CoeffReturnType tmp = m_data[index];
+    return tmp;
+#endif
   }
 
   template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b5ef31d..77c9c6c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -124,7 +124,9 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Tuple& operator= (const Tuple& rhs) {
+  #ifndef __SYCL_DEVICE_ONLY__
     if (&rhs == this) return *this;
+  #endif
     first = rhs.first;
     second = rhs.second;
     return *this;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index c9c7acf..9489925 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -35,7 +35,7 @@
             /* Two accessors are used: one to the buffer that is being reduced,
              * and a second to local memory, used to store intermediate data. */
             auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
-            auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
+            auto aOut =bufOut.template get_access<cl::sycl::access::mode::write>(h);
             typedef decltype(aI) InputAccessor;
             typedef decltype(aOut) OutputAccessor;
             typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
@@ -158,7 +158,7 @@
       typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
       // create a tuple of accessors from Evaluator
       Tuple_of_Acc tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
-      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
+      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, output);
       ptrdiff_t out_offset = dev.get_offset(output);
       Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
       cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
index 9d5a6d4..3d62706 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -32,6 +32,8 @@
 
 
 namespace Eigen {
+  template<typename StrideDims, typename XprType> class TensorTupleReducerDeviceOp;
+  template<typename StrideDims, typename ArgType> struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, SyclKernelDevice>;
 namespace TensorSycl {
 namespace internal {
 
@@ -48,6 +50,13 @@
   typedef T Type;
 };
 
+template <bool Conds,  size_t X , size_t Y > struct ValueCondition {
+  static const size_t Res =X;
+};
+template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
+  static const size_t Res =Y;
+};
+
 }
 }
 }
@@ -80,6 +89,9 @@
 /// this is used for extracting tensor reduction
 #include "TensorReductionSycl.h"
 
+// TensorArgMaxSycl.h
+#include "TensorArgMaxSycl.h"
+
 /// this is used for extracting tensor convolution
 #include "TensorConvolutionSycl.h"
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
index 9476c0e..d6ac7b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -103,7 +103,7 @@
 #undef KERNELBROKERCONVERT
 
 /// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp
-#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(CVQual, ExprNode)\
+#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\
 template <typename Expr>\
 struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
   typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\
@@ -111,15 +111,17 @@
 
 
 // TensorForcedEvalOp
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp)
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp)
 
 // TensorLayoutSwapOp
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorLayoutSwapOp)
-KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorLayoutSwapOp)
-#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp)
 
-
+//TensorIndexTupleOp
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp)
+KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp)
+#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP
 
 /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
 #define KERNELBROKERCONVERTREDUCTION(CVQual)\
@@ -132,6 +134,18 @@
 KERNELBROKERCONVERTREDUCTION()
 #undef KERNELBROKERCONVERTREDUCTION
 
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorTupleReducerOp
+#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\
+template <typename OP, typename Dim, typename subExpr>\
+struct ConvertToDeviceExpression<CVQual TensorTupleReducerOp<OP, Dim, subExpr> > {\
+  typedef CVQual TensorTupleReducerOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type> Type;\
+};
+
+KERNELBROKERCONVERTTUPLEREDUCTION(const)
+KERNELBROKERCONVERTTUPLEREDUCTION()
+#undef KERNELBROKERCONVERTTUPLEREDUCTION
+
+//TensorSlicingOp
 #define KERNELBROKERCONVERTSLICEOP(CVQual)\
 template<typename StartIndices, typename Sizes, typename XprType>\
 struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\
@@ -142,7 +156,7 @@
 KERNELBROKERCONVERTSLICEOP()
 #undef KERNELBROKERCONVERTSLICEOP
 
-
+//TensorStridingSlicingOp
 #define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
 struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\
@@ -153,7 +167,6 @@
 KERNELBROKERCONVERTERSLICESTRIDEOP()
 #undef KERNELBROKERCONVERTERSLICESTRIDEOP
 
-
 /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp
 #define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\
 template <DenseIndex DimId, typename Expr>\
@@ -164,9 +177,6 @@
 KERNELBROKERCONVERTCHIPPINGOP()
 #undef KERNELBROKERCONVERTCHIPPINGOP
 
-
-
-
 /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp
 #define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>\
@@ -188,8 +198,6 @@
 KERNELBROKERCONVERTVOLUMEPATCHOP()
 #undef KERNELBROKERCONVERTVOLUMEPATCHOP
 
-
-
 }  // namespace internal
 }  // namespace TensorSycl
 }  // namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
index af4eb5f..24cc23f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -65,7 +65,6 @@
   : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\
 };
 
-
 TENSORMAP(const)
 TENSORMAP()
 #undef TENSORMAP
@@ -83,6 +82,7 @@
   ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\
   : expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\
 };
+
 TENSORMAPFIXEDSIZE(const)
 TENSORMAPFIXEDSIZE()
 #undef TENSORMAPFIXEDSIZE
@@ -189,9 +189,6 @@
  ASSIGN()
  #undef ASSIGN
 
-
-
-
  /// specialisation of the \ref ExprConstructor struct when the node type is
  /// const TensorAssignOp
  #define CONVERSIONEXPRCONST(CVQual)\
@@ -252,8 +249,6 @@
 FORCEDEVAL()
 #undef FORCEDEVAL
 
-
-
 #define TENSORCUSTOMUNARYOP(CVQual)\
 template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
 struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\
@@ -274,13 +269,6 @@
 TENSORCUSTOMUNARYOP()
 #undef TENSORCUSTOMUNARYOP
 
-template <bool Conds,  size_t X , size_t Y > struct ValueCondition {
-  static const size_t Res =X;
-};
-template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
-  static const size_t Res =Y;
-};
-
 /// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
 #define SYCLREDUCTIONEXPR(CVQual)\
 template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
@@ -299,6 +287,35 @@
 SYCLREDUCTIONEXPR()
 #undef SYCLREDUCTIONEXPR
 
+/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp
+/// TensorReductionOp is used instead of TensorTupleReducerOp in order to build the TensorMap, because the TensorMap is the output of the TensorReductionOp.
+#define SYCLTUPLEREDUCTIONEXPR(CVQual)\
+template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorTupleReducerOp<OP, Dim, OrigExpr>,\
+CVQual PlaceHolder<CVQual TensorTupleReducerOp<OP, Dim, DevExpr>, N>, Params...> {\
+  static const auto NumRedDims= TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr> , MakeGlobalPointer>::NumDimensions;\
+  static const auto NumIndices= ValueCondition<NumRedDims==0, 1, NumRedDims>::Res;\
+static const int Layout =static_cast<int>(Eigen::internal::traits<TensorReductionOp<OP, Dim, const TensorIndexTupleOp<OrigExpr>, MakeGlobalPointer>>::Layout);\
+  typedef CVQual TensorMap<\
+                          Tensor<typename TensorIndexTupleOp<OrigExpr>::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp<OP, Dim, OrigExpr>::Index>,\
+                          Layout,\
+                          MakeGlobalPointer\
+                          > XprType;\
+  typedef typename TensorEvaluator<const TensorIndexTupleOp<OrigExpr> , SyclKernelDevice>::Dimensions InputDimensions;\
+  static const int NumDims = Eigen::internal::array_size<InputDimensions>::value;\
+  typedef array<Index, NumDims> StrideDims;\
+  typedef const TensorTupleReducerDeviceOp<StrideDims, XprType> Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+  :expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get<N>(t)), fd.dimensions()),\
+    fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\
+    }\
+};
+
+SYCLTUPLEREDUCTIONEXPR(const)
+SYCLTUPLEREDUCTIONEXPR()
+#undef SYCLTUPLEREDUCTIONEXPR
 
 /// specialisation of the \ref ExprConstructor struct when the node type is
 /// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp
@@ -319,15 +336,18 @@
   :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
 };
 
+//TensorContractionOp
 SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp)
 SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp)
+//TensorConvolutionOp
 SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp)
 SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp)
+//TensorCustomBinaryOp
 SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp)
 SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp)
 #undef SYCLCONTRACTCONVCUSBIOPS
 
-
+//TensorSlicingOp
 #define SYCLSLICEOPEXPR(CVQual)\
 template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
 struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\
@@ -344,7 +364,7 @@
 SYCLSLICEOPEXPR()
 #undef SYCLSLICEOPEXPR
 
-
+//TensorStridingSlicingOp
 #define SYCLSLICESTRIDEOPEXPR(CVQual)\
 template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\
 struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\
@@ -361,6 +381,7 @@
 SYCLSLICESTRIDEOPEXPR()
 #undef SYCLSLICESTRIDEOPEXPR
 
+//TensorReshapingOp and TensorShufflingOp
 #define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\
 template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
 struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
@@ -373,13 +394,15 @@
   : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\
 };
 
+// TensorReshapingOp
 SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const)
 SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, )
-
+// TensorShufflingOp
 SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const)
 SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, )
 #undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST
 
+//TensorPaddingOp
 #define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\
 template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
 struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
@@ -392,11 +415,11 @@
   : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\
 };
 
+//TensorPaddingOp
 SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const)
 SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
 #undef SYCLPADDINGOPEXPRCONST
 
-
 // TensorChippingOp
 #define SYCLTENSORCHIPPINGOPEXPR(CVQual)\
 template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\
@@ -454,14 +477,12 @@
 SYCLTENSORVOLUMEPATCHOPEXPR()
 #undef SYCLTENSORVOLUMEPATCHOPEXPR
 
-
-
-// TensorLayoutSwapOp
-#define SYCLTENSORLAYOUTSWAPOPEXPR(CVQual)\
+// TensorLayoutSwapOp and TensorIndexTupleOp
+#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\
 template<typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorLayoutSwapOp <OrigXprType> , CVQual TensorLayoutSwapOp<XprType>, Params... >{\
+struct ExprConstructor<CVQual ExprNode <OrigXprType> , CVQual ExprNode<XprType>, Params... >{\
   typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
-  typedef CVQual TensorLayoutSwapOp<typename my_xpr_type::Type> Type;\
+  typedef CVQual ExprNode<typename my_xpr_type::Type> Type;\
   my_xpr_type xprExpr;\
   Type expr;\
   template <typename FuncDetector>\
@@ -469,10 +490,14 @@
   : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\
 };
 
-SYCLTENSORLAYOUTSWAPOPEXPR(const)
-SYCLTENSORLAYOUTSWAPOPEXPR()
-#undef SYCLTENSORLAYOUTSWAPOPEXPR
+//TensorLayoutSwapOp
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp)
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp)
+//TensorIndexTupleOp
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp)
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp)
 
+#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR
 
 /// template deduction for \ref ExprConstructor struct
 template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
index 5a6a8f4..fb95af5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -147,7 +147,7 @@
 SYCLFORCEDEVALEXTACC()
 #undef SYCLFORCEDEVALEXTACC
 
-
+//TensorCustomUnaryOp
 #define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
 template <typename CustomUnaryFunc, typename XprType, typename Dev >\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
@@ -160,7 +160,7 @@
 SYCLCUSTOMUNARYOPEXTACC()
 #undef SYCLCUSTOMUNARYOPEXTACC
 
-
+//TensorCustomBinaryOp
 #define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
 template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
@@ -172,9 +172,6 @@
 SYCLCUSTOMBINARYOPEXTACC()
 #undef SYCLCUSTOMBIBARYOPEXTACC
 
-
-
-
 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
 #define SYCLEVALTOEXTACC(CVQual)\
 template <typename Expr, typename Dev>\
@@ -188,15 +185,19 @@
 #undef SYCLEVALTOEXTACC
 
 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXTACC(CVQual)\
+#define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\
 template <typename OP, typename Dim, typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\
+struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev> > {\
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<OP, Dim, Expr>, Dev>& eval)\
   RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
 };
+// TensorReductionOp
+SYCLREDUCTIONEXTACC(const,TensorReductionOp)
+SYCLREDUCTIONEXTACC(,TensorReductionOp)
 
-SYCLREDUCTIONEXTACC(const)
-SYCLREDUCTIONEXTACC()
+// TensorTupleReducerOp
+SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp)
+SYCLREDUCTIONEXTACC(,TensorTupleReducerOp)
 #undef SYCLREDUCTIONEXTACC
 
 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
@@ -206,14 +207,14 @@
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
   RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
 };
-
+//TensorContractionOp
 SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
 SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
+//TensorConvolutionOp
 SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
 SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
 #undef SYCLCONTRACTIONCONVOLUTIONEXTACC
 
-
 /// specialisation of the \ref ExtractAccessor struct when the node type is
 /// const TensorSlicingOp.
 #define SYCLSLICEOPEXTACC(CVQual)\
@@ -252,7 +253,6 @@
 SYCLTENSORCHIPPINGOPEXTACC()
 #undef SYCLTENSORCHIPPINGOPEXTACC
 
-
 // specialisation of the \ref ExtractAccessor struct when the node type is
 /// TensorImagePatchOp.
 #define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\
@@ -266,8 +266,6 @@
 SYCLTENSORIMAGEPATCHOPEXTACC()
 #undef SYCLTENSORIMAGEPATCHOPEXTACC
 
-
-
 // specialisation of the \ref ExtractAccessor struct when the node type is
 /// TensorVolumePatchOp.
 #define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\
@@ -281,21 +279,23 @@
 SYCLTENSORVOLUMEPATCHOPEXTACC()
 #undef SYCLTENSORVOLUMEPATCHOPEXTACC
 
-
 // specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorLayoutSwapOp.
-#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\
+/// TensorLayoutSwapOp, TensorIndexTupleOp
+#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\
 template<typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorLayoutSwapOp<XprType>, Dev> >{\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorLayoutSwapOp<XprType>, Dev>& eval)\
+struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<XprType>, Dev> >{\
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<XprType>, Dev>& eval)\
   RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
 };
 
-SYCLTENSORLAYOUTSWAPOPEXTACC(const)
-SYCLTENSORLAYOUTSWAPOPEXTACC()
-#undef SYCLTENSORLAYOUTSWAPOPEXTACC
+// TensorLayoutSwapOp
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp)
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp)
+//TensorIndexTupleOp
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp)
+SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp)
 
-
+#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC
 
 /// template deduction for \ref ExtractAccessor
 template <typename Evaluator>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index 9fcac5e..942e9d3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -126,19 +126,19 @@
   typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
   DEFALTACTION(Evaluator)\
 };
-
+//TensorCustomUnaryOp
 SYCLEXTRFUNCCUSTOMUNARYOP(const)
 SYCLEXTRFUNCCUSTOMUNARYOP()
 #undef SYCLEXTRFUNCCUSTOMUNARYOP
 
-
+//TensorCustomBinaryOp
 #define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\
 template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
 struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
   typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
   DEFALTACTION(Evaluator)\
 };
-
+//TensorCustomBinaryOp
 SYCLEXTRFUNCCUSTOMBIBARYOP(const)
 SYCLEXTRFUNCCUSTOMBIBARYOP()
 #undef SYCLEXTRFUNCCUSTOMBIBARYOP
@@ -177,7 +177,7 @@
 
 /// specialisation of the \ref FunctorExtractor struct when the node types are
 /// TensorEvalToOp, TensorLayoutSwapOp. This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(CVQual, ExprNode)\
+#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\
 template <typename Expr, typename Dev>\
 struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Expr>, Dev> > {\
   FunctorExtractor<TensorEvaluator<Expr, Dev> > xprExpr;\
@@ -185,13 +185,16 @@
   : xprExpr(expr.impl()) {}\
 };
 //TensorEvalToOp
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorEvalToOp)
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorEvalToOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp)
 // TensorLayoutSwapOp
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorLayoutSwapOp)
-SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorLayoutSwapOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp)
+// TensorIndexTupleOp
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp)
+SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp)
 
-#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUT
+#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE
 
 template<typename Dim, size_t NumOutputDim> struct DimConstr {
 template<typename InDim>
@@ -202,10 +205,10 @@
   template<typename InDim>
     static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));}
 };
-
+//TensorReductionOp
 #define SYCLEXTRFUNCREDUCTIONOP(CVQual)\
 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> >{\
   typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\
   typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
   const Dimensions m_dimensions;\
@@ -213,12 +216,39 @@
   FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\
   : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\
 };
-
-
 SYCLEXTRFUNCREDUCTIONOP(const)
 SYCLEXTRFUNCREDUCTIONOP()
 #undef SYCLEXTRFUNCREDUCTIONOP
 
+//TensorTupleReducerOp
+#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\
+template<typename ReduceOp, typename Dims, typename ArgType, typename Device>\
+ struct FunctorExtractor<TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> >{\
+ typedef TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device> Evaluator;\
+ static const int  NumOutputDims= Eigen::internal::traits<TensorTupleReducerOp<ReduceOp, Dims, ArgType> >::NumDimensions;\
+ typedef typename Evaluator::StrideDims StrideDims;\
+ typedef typename Evaluator::Index Index;\
+ typedef typename Eigen::internal::conditional<NumOutputDims==0, DSizes<Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
+ const Dimensions m_dimensions;\
+ const int m_return_dim;\
+ const StrideDims m_strides;\
+ const Index m_stride_mod;\
+ const Index m_stride_div;\
+ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
+ EIGEN_STRONG_INLINE  int return_dim() const {return m_return_dim;}\
+ EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}\
+ EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}\
+ EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}\
+ FunctorExtractor(const TensorEvaluator<CVQual TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>& expr)\
+ : m_dimensions(DimConstr<Dimensions, NumOutputDims>::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\
+   m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\
+};
+
+SYCLEXTRFUNCTUPLEREDUCTIONOP(const)
+SYCLEXTRFUNCTUPLEREDUCTIONOP()
+#undef SYCLEXTRFUNCTUPLEREDUCTIONOP
+
+//TensorContractionOp and TensorConvolutionOp
 #define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
 template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
 struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
@@ -230,9 +260,10 @@
   : m_dimensions(expr.dimensions()) {}\
 };
 
-
+//TensorContractionOp
 SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
 SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
+//TensorConvolutionOp
 SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
 SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
 #undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
@@ -255,6 +286,7 @@
 SYCLEXTRFUNCTSLICEOP()
 #undef SYCLEXTRFUNCTSLICEOP
 
+//TensorStridingSlicingOp
 #define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
 struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
@@ -273,7 +305,7 @@
 SYCLEXTRFUNCTSLICESTRIDEOP()
 #undef SYCLEXTRFUNCTSLICESTRIDEOP
 
-// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory
+// Had to separate TensorReshapingOp and TensorShufflingOp; otherwise they would be mistaken for UnaryCategory
 #define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
 template<typename Param, typename XprType, typename Dev>\
 struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
@@ -284,9 +316,11 @@
   : xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\
 };
 
+//TensorReshapingOp
 SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
 SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
 
+//TensorShufflingOp
 SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
 SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
 #undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
@@ -343,6 +377,7 @@
 SYCLEXTRFUNCCHIPPINGOP()
 #undef SYCLEXTRFUNCCHIPPINGOP
 
+//TensorImagePatchOp
 #define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\
 template<DenseIndex Rows, DenseIndex Cols, typename XprType, typename Device>\
 struct FunctorExtractor<TensorEvaluator<CVQual TensorImagePatchOp<Rows, Cols, XprType>, Device> >{\
@@ -420,7 +455,6 @@
 #undef SYCLEXTRFUNCVOLUMEPATCHOP
 
 
-
 /// template deduction function for FunctorExtractor
 template <typename Evaluator>
 auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
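
The TensorTupleReducerOp extractor above ships return_dim, strides, stride_mod and stride_div to the device so the kernel can turn the flat index of the winning tuple back into a coordinate along the reduced dimension. The stand-alone sketch below works through that `(index % stride_mod) / stride_div` arithmetic for a small ColMajor tensor; the variable names are invented for illustration and none of this is the Eigen implementation.

#include <cassert>
#include <cstddef>

int main() {
  const std::ptrdiff_t dims[3] = {4, 5, 6};   // tensor dimensions
  const int return_dim = 1;                   // argmax over dimension 1
  // ColMajor strides: stride[d] is the product of the dimensions before d.
  const std::ptrdiff_t strides[3] = {1, dims[0], dims[0] * dims[1]};
  const std::ptrdiff_t total = dims[0] * dims[1] * dims[2];
  const std::ptrdiff_t stride_mod = (return_dim < 2) ? strides[return_dim + 1] : total;
  const std::ptrdiff_t stride_div = strides[return_dim];

  // Flat (ColMajor) index of element (i, j, k) = (2, 3, 4):
  const std::ptrdiff_t flat = 2 * strides[0] + 3 * strides[1] + 4 * strides[2];
  // Recover the coordinate along return_dim from the flat index:
  assert((flat % stride_mod) / stride_div == 3);
  return 0;
}
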
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
index 12237bf..e5b892f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
@@ -72,7 +72,7 @@
 template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
  public:
   typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
+  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
   ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
   :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
   void operator()(cl::sycl::nd_item<1> itemID) {
@@ -85,8 +85,8 @@
     const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
     /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
     /// the device_evaluator is detectable and recognisable on the device.
-    typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
-    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+    typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
+    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
     auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
     /// const cast added as a naive solution to solve the qualifier drop error
     auto globalid=static_cast<Index>(itemID.get_global_linear_id());
@@ -111,7 +111,7 @@
 class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
  public:
   typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
+  typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
   typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
   ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
     Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>,  Index range_, Index num_values_to_reduce_)
@@ -126,8 +126,8 @@
     const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
     /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
     /// the device_evaluator is detectable and recognisable on the device.
-    typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
-    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+    typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice> DeviceSelf;
+    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
     auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
     /// const cast added as a naive solution to solve the qualifier drop error
     auto globalid=static_cast<Index>(itemID.get_global_linear_id());
@@ -173,7 +173,7 @@
     const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
     /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
     /// the device_evaluator is detectable and recognisable on the device.
-    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
     /// const cast added as a naive solution to solve the qualifier drop error
     auto globalid=itemID.get_global_linear_id();
 
@@ -220,7 +220,7 @@
     const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
     /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
     /// the device_evaluator is detectable and recognisable on the device.
-    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::SyclKernelDevice>(device_self_expr, Eigen::SyclKernelDevice());
     /// const cast added as a naive solution to solve the qualifier drop error
     auto globalid=itemID.get_global_linear_id();
     auto scale = (rng*red_factor) + remaining;
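
The write accessors above switch from discard_write to write, matching the commit message. In SYCL terms the difference is that access::mode::write keeps the existing buffer contents valid, so elements the kernel never touches retain their host values, whereas discard_write lets the runtime treat the buffer as undefined on entry, which is what produced the data mismatch. A minimal stand-alone sketch of that behaviour, assuming a plain float buffer and an invented kernel name rather than the Eigen reduction functors:

#include <CL/sycl.hpp>
#include <vector>

int main() {
  std::vector<float> out(8, -1.0f);  // pre-initialised output; odd entries must survive
  {
    cl::sycl::buffer<float, 1> buf(out.data(), cl::sycl::range<1>(out.size()));
    cl::sycl::queue q;
    q.submit([&](cl::sycl::handler& cgh) {
      // access::mode::write (not discard_write): untouched elements keep their values.
      auto acc = buf.get_access<cl::sycl::access::mode::write>(cgh);
      cgh.parallel_for<class partial_write_sketch>(
          cl::sycl::range<1>(out.size() / 2),
          [=](cl::sycl::id<1> i) { acc[2 * i[0]] = static_cast<float>(i[0]); });
    });
  }  // buffer destruction copies the device data back into `out`
  return 0;
}
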
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
index 330283b..234580c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -114,27 +114,37 @@
 #undef SYCLCUSTOMBINARYOPLEAFCOUNT
 
 /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
-#define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\
+#define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual , ExprNode, Num)\
 template <typename Expr>\
 struct LeafCount<CVQual ExprNode<Expr> > {\
   static const size_t Count = Num + CategoryCount<Expr>::Count;\
 };
 
-EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorEvalToOp, 1)
-EVALTOLAYOUTSWAPLEAFCOUNT(, TensorEvalToOp, 1)
-EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorLayoutSwapOp, 0)
-EVALTOLAYOUTSWAPLEAFCOUNT(, TensorLayoutSwapOp, 0)
-#undef EVALTOLAYOUTSWAPLEAFCOUNT
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0)
+
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0)
+EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0)
+
+#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT
 
 /// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
-#define REDUCTIONLEAFCOUNT(CVQual)\
+#define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\
 template <typename OP, typename Dim, typename Expr>\
-struct LeafCount<CVQual TensorReductionOp<OP, Dim, Expr> > {\
+struct LeafCount<CVQual ExprNode<OP, Dim, Expr> > {\
     static const size_t Count =1;\
 };
 
-REDUCTIONLEAFCOUNT(const)
-REDUCTIONLEAFCOUNT()
+// TensorReductionOp
+REDUCTIONLEAFCOUNT(const,TensorReductionOp)
+REDUCTIONLEAFCOUNT(,TensorReductionOp)
+
+// Tensor ArgMax - TensorTupleReducerOp
+REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp)
+REDUCTIONLEAFCOUNT(, TensorTupleReducerOp)
+
 #undef REDUCTIONLEAFCOUNT
 
 /// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp
@@ -150,8 +160,6 @@
 CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
 #undef CONTRACTIONCONVOLUTIONLEAFCOUNT
 
-
-
 /// specialisation of the \ref LeafCount struct when the node type is  TensorSlicingOp
 #define SLICEOPLEAFCOUNT(CVQual)\
 template <typename StartIndices, typename Sizes, typename XprType>\
@@ -161,7 +169,6 @@
 SLICEOPLEAFCOUNT()
 #undef SLICEOPLEAFCOUNT
 
-
 /// specialisation of the \ref LeafCount struct when the node type is  TensorChippingOp
 #define CHIPPINGOPLEAFCOUNT(CVQual)\
 template <DenseIndex DimId, typename XprType>\
@@ -195,7 +202,6 @@
 template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>\
 struct LeafCount<CVQual TensorVolumePatchOp<Planes, Rows, Cols, XprType> >:CategoryCount<XprType>{};
 
-
 TENSORVOLUMEPATCHOPLEAFCOUNT(const)
 TENSORVOLUMEPATCHOPLEAFCOUNT()
 #undef TENSORVOLUMEPATCHOPLEAFCOUNT
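
REDUCTIONLEAFCOUNT above now covers TensorTupleReducerOp as well, so an argmax/argmin sub-tree counts as exactly one leaf, presumably because the whole reduction is evaluated into a single device buffer and re-enters the kernel expression as one placeholder (see the PlaceHolderExpression change below). The following is a hypothetical, self-contained illustration of that counting rule; the node and trait names are invented and are not the Eigen internals.

#include <cstddef>

struct Leaf {};                                          // e.g. a TensorMap placeholder
template <typename L, typename R> struct BinaryNode {};  // an ordinary binary expression
template <typename Arg> struct ReductionNode {};         // argmax/argmin reduce here

template <typename Expr> struct LeafCountSketch;
template <> struct LeafCountSketch<Leaf> { static const std::size_t Count = 1; };
template <typename L, typename R>
struct LeafCountSketch<BinaryNode<L, R> > {
  static const std::size_t Count = LeafCountSketch<L>::Count + LeafCountSketch<R>::Count;
};
// The reduction sub-tree collapses to a single leaf, whatever it contains.
template <typename Arg>
struct LeafCountSketch<ReductionNode<Arg> > { static const std::size_t Count = 1; };

static_assert(LeafCountSketch<BinaryNode<Leaf, ReductionNode<BinaryNode<Leaf, Leaf> > > >::Count == 2,
              "one leaf for the map, one for the whole reduction");

int main() { return 0; }
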
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
index 99d5289..9d5708f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -171,19 +171,24 @@
 
 
 /// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorEvalToOp, TensorLayoutSwapOp
-#define EVALTOLAYOUTSWAP(CVQual, ExprNode)\
+/// TensorEvalToOp, TensorLayoutSwapOp, and TensorIndexTupleOp
+#define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\
 template <typename Expr, size_t N>\
 struct PlaceHolderExpression<CVQual ExprNode<Expr>, N> {\
   typedef CVQual ExprNode<typename CalculateIndex <N, Expr>::ArgType> Type;\
 };
 
-EVALTOLAYOUTSWAP(const, TensorEvalToOp)
-EVALTOLAYOUTSWAP(, TensorEvalToOp)
-EVALTOLAYOUTSWAP(const, TensorLayoutSwapOp)
-EVALTOLAYOUTSWAP(, TensorLayoutSwapOp)
+// TensorEvalToOp
+EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp)
+EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp)
+//TensorLayoutSwapOp
+EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp)
+EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp)
+//TensorIndexTupleOp
+EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp)
+EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp)
 
-#undef EVALTOLAYOUTSWAP
+#undef EVALTOLAYOUTSWAPINDEXTUPLE
 
 
 /// specialisation of the \ref PlaceHolderExpression when the node is
@@ -199,17 +204,24 @@
 #undef CHIPPINGOP
 
 /// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorReductionOp
-#define SYCLREDUCTION(CVQual)\
+/// TensorReductionOp and TensorTupleReducerOp (Argmax)
+#define SYCLREDUCTION(CVQual, ExprNode)\
 template <typename OP, typename Dims, typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\
-  typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\
+struct PlaceHolderExpression<CVQual ExprNode<OP, Dims, Expr>, N>{\
+  typedef CVQual PlaceHolder<CVQual ExprNode<OP, Dims,Expr>, N> Type;\
 };
-SYCLREDUCTION(const)
-SYCLREDUCTION()
+
+// TensorReductionOp
+SYCLREDUCTION(const, TensorReductionOp)
+SYCLREDUCTION(, TensorReductionOp)
+
+// Tensor ArgMax - TensorTupleReducerOp
+SYCLREDUCTION(const, TensorTupleReducerOp)
+SYCLREDUCTION(, TensorTupleReducerOp)
 #undef SYCLREDUCTION
 
 
+
 /// specialisation of the \ref PlaceHolderExpression when the node is
 /// TensorReductionOp
 #define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index cac7855..29c7818 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -25,7 +25,6 @@
 
 namespace Eigen {
 namespace TensorSycl {
-
 template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
   typedef  typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
 
@@ -38,7 +37,7 @@
   void operator()(cl::sycl::nd_item<1> itemID) {
     typedef  typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
     auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-    auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+    auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::SyclKernelDevice>(device_expr.expr, Eigen::SyclKernelDevice());
     typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
     if (gId < range)
       device_evaluator.evalScalar(gId);
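
ExecExprFunctorKernel above now builds its device evaluator with Eigen::SyclKernelDevice, but the launch pattern is unchanged: one work-item per output coefficient, an nd-range rounded up to a multiple of the work-group size, and a guard on the global id for the padded tail. A minimal stand-alone SYCL sketch of that pattern, using a plain float buffer and an invented kernel name instead of the Eigen expression tree:

#include <CL/sycl.hpp>
#include <cstddef>
#include <vector>

int main() {
  const std::size_t n = 100;                                      // output coefficients
  const std::size_t local = 64;                                   // work-group size
  const std::size_t global = ((n + local - 1) / local) * local;   // padded nd-range
  std::vector<float> out(n, 0.0f);
  {
    cl::sycl::buffer<float, 1> buf(out.data(), cl::sycl::range<1>(n));
    cl::sycl::queue q;
    q.submit([&](cl::sycl::handler& cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::write>(cgh);
      cgh.parallel_for<class eval_scalar_sketch>(
          cl::sycl::nd_range<1>(cl::sycl::range<1>(global), cl::sycl::range<1>(local)),
          [=](cl::sycl::nd_item<1> item) {
            const std::size_t gid = item.get_global_linear_id();
            if (gid < n) acc[gid] = static_cast<float>(gid) * 2.0f;  // plays the role of evalScalar
          });
    });
  }
  return 0;
}
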
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 9961782..4a558f8 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -173,6 +173,7 @@
     ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11")
     ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11")
     ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11")
+    ei_add_test_sycl(cxx11_tensor_argmax_sycl "-std=c++11")
     ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11")
   endif(EIGEN_TEST_SYCL)
   # It should be safe to always run these tests as there is some fallback code for
diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
new file mode 100644
index 0000000..9b22f1e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -0,0 +1,248 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_argmax_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int Layout, typename DenseIndex>
+static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){
+
+  Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2,2,2}});
+  Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
+  Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
+  in.setRandom();
+  in *= in.constant(100.0);
+  in(0, 0, 0) = -1000.0;
+  in(1, 1, 1) = 1000.0;
+
+  std::size_t in_bytes = in.size() * sizeof(DataType);
+  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+  DataType * d_in       = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+  DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+  DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 3>{{2,2,2}});
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max);
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min);
+  sycl_device.memcpyHostToDevice(d_in, in.data(),in_bytes);
+
+  gpu_out_max.device(sycl_device) = gpu_in.argmax();
+  gpu_out_min.device(sycl_device) = gpu_in.argmin();
+
+  sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes);
+  sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes);
+
+  VERIFY_IS_EQUAL(out_max(), 2*2*2 - 1);
+  VERIFY_IS_EQUAL(out_min(), 0);
+
+  sycl_device.deallocate(d_in);
+  sycl_device.deallocate(d_out_max);
+  sycl_device.deallocate(d_out_min);
+}
+
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device)
+{
+  DenseIndex sizeDim0=9;
+  DenseIndex sizeDim1=3;
+  DenseIndex sizeDim2=5;
+  DenseIndex sizeDim3=7;
+  Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3);
+
+  std::vector<DenseIndex> dims;
+  dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3);
+  for (DenseIndex dim = 0; dim < 4; ++dim) {
+
+    array<DenseIndex, 3> out_shape;
+    for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+    Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+            tensor(ix)=(ix[dim] != 0)?-1.0:10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(DataType);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+
+    DataType * d_in       = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+    DenseIndex* d_out= static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+    Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 4>{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}});
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+                    size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim)));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+       VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    sycl_device.synchronize();
+
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?-1.0:20.0;
+          }
+        }
+      }
+    }
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+    sycl_device.deallocate(d_in);
+    sycl_device.deallocate(d_out);
+  }
+}
+
+
+
+
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device)
+{
+  DenseIndex sizeDim0=9;
+  DenseIndex sizeDim1=3;
+  DenseIndex sizeDim2=5;
+  DenseIndex sizeDim3=7;
+  Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3);
+
+  std::vector<DenseIndex> dims;
+  dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3);
+  for (DenseIndex dim = 0; dim < 4; ++dim) {
+
+    array<DenseIndex, 3> out_shape;
+    for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+    Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+            tensor(ix)=(ix[dim] != 0)?1.0:-10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(DataType);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+
+    DataType * d_in       = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+    DenseIndex* d_out= static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+    Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(d_in, Eigen::array<DenseIndex, 4>{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}});
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+                    size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim)));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect min to be in the first index of the reduced dimension
+       VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    sycl_device.synchronize();
+
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+            tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?1.0:-20.0;
+          }
+        }
+      }
+    }
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect min to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+    sycl_device.deallocate(d_in);
+    sycl_device.deallocate(d_out);
+  }
+}
+
+
+
+
+template<typename DataType, typename Device_Selector> void sycl_argmax_test_per_device(const Device_Selector& d){
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_simple_argmax<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmax_dim<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmax_dim<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_argmin_dim<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device);
+}
+
+void test_cxx11_tensor_argmax_sycl() {
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_argmax_test_per_device<double>(device));
+  }
+}