Eigen/src/Core/products/CoeffBasedProduct.h - platform/external/eigen - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 // Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

 #ifndef EIGEN_COEFFBASED_PRODUCT_H
 #define EIGEN_COEFFBASED_PRODUCT_H

 namespace Eigen {

 namespace internal {

 /*********************************************************************************
 *  Coefficient based product implementation.
 *  It is designed for the following use cases:
 *  - small fixed sizes
 *  - lazy products
 *********************************************************************************/

 /* Since the all the dimensions of the product are small, here we can rely
  * on the generic Assign mechanism to evaluate the product per coeff (or packet).
  *
  * Note that here the inner-loops should always be unrolled.
  */

 template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
 struct product_coeff_impl;

 template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct product_packet_impl;

 template<typename LhsNested, typename RhsNested, int NestingFlags>
 struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
 {
   typedef MatrixXpr XprKind;
   typedef typename remove_all<LhsNested>::type _LhsNested;
   typedef typename remove_all<RhsNested>::type _RhsNested;
   typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
   typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
                                            typename traits<_RhsNested>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<_LhsNested>::Index,
                                          typename traits<_RhsNested>::Index>::type Index;

   enum {
       LhsCoeffReadCost = _LhsNested::CoeffReadCost,
       RhsCoeffReadCost = _RhsNested::CoeffReadCost,
       LhsFlags = _LhsNested::Flags,
       RhsFlags = _RhsNested::Flags,

       RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
       ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
       InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),

       MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,

       LhsRowMajor = LhsFlags & RowMajorBit,
       RhsRowMajor = RhsFlags & RowMajorBit,

       SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,

       CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
                       && (ColsAtCompileTime == Dynamic
                           || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
                               && (RhsFlags&AlignedBit)
                              )
                          ),

       CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
                       && (RowsAtCompileTime == Dynamic
                           || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
                               && (LhsFlags&AlignedBit)
                              )
                          ),

       EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
                      : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
                      : (RhsRowMajor && !CanVectorizeLhs),

       Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
             | (EvalToRowMajor ? RowMajorBit : 0)
             | NestingFlags
             | (LhsFlags & RhsFlags & AlignedBit)
             // TODO enable vectorization for mixed types
             | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),

       CoeffReadCost = InnerSize == Dynamic ? Dynamic
                     : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
                       + (InnerSize - 1) * NumTraits<Scalar>::AddCost,

       /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
       * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
       * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
       * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
       */
       CanVectorizeInner =    SameType
                           && LhsRowMajor
                           && (!RhsRowMajor)
                           && (LhsFlags & RhsFlags & ActualPacketAccessBit)
                           && (LhsFlags & RhsFlags & AlignedBit)
                           && (InnerSize % packet_traits<Scalar>::size == 0)
     };
 };

 } // end namespace internal

 template<typename LhsNested, typename RhsNested, int NestingFlags>
 class CoeffBasedProduct
   : internal::no_assignment_operator,
     public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
 {
   public:

     typedef MatrixBase<CoeffBasedProduct> Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
     typedef typename Base::PlainObject PlainObject;

   private:

     typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
     typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;

     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
       InnerSize  = internal::traits<CoeffBasedProduct>::InnerSize,
       Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
       CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
     };

     typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
                                    Unroll ? InnerSize-1 : Dynamic,
                                    _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;

     typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;

   public:

     inline CoeffBasedProduct(const CoeffBasedProduct& other)
       : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
     {}

     template<typename Lhs, typename Rhs>
     inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
       : m_lhs(lhs), m_rhs(rhs)
     {
       // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
       // We still allow to mix T and complex<T>.
       EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
       eigen_assert(lhs.cols() == rhs.rows()
         && "invalid matrix product"
         && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
     }

     EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
     EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }

     EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
     {
       Scalar res;
       ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
       return res;
     }

     /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
      * which is why we don't set the LinearAccessBit.
      */
     EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
     {
       Scalar res;
       const Index row = RowsAtCompileTime == 1 ? 0 : index;
       const Index col = RowsAtCompileTime == 1 ? index : 0;
       ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
       return res;
     }

     template<int LoadMode>
     EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
     {
       PacketScalar res;
       internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
                               Unroll ? InnerSize-1 : Dynamic,
                               _LhsNested, _RhsNested, PacketScalar, LoadMode>
         ::run(row, col, m_lhs, m_rhs, res);
       return res;
     }

     // Implicit conversion to the nested type (trigger the evaluation of the product)
     EIGEN_STRONG_INLINE operator const PlainObject& () const
     {
       m_result.lazyAssign(*this);
       return m_result;
     }

     const _LhsNested& lhs() const { return m_lhs; }
     const _RhsNested& rhs() const { return m_rhs; }

     const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
     { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }

     template<int DiagonalIndex>
     const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
     { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }

     const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
     { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }

   protected:
     typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
     typename internal::add_const_on_value_type<RhsNested>::type m_rhs;

     mutable PlainObject m_result;
 };

 namespace internal {

 // here we need to overload the nested rule for products
 // such that the nested type is a const reference to a plain matrix
 template<typename Lhs, typename Rhs, int N, typename PlainObject>
 struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject>
 {
   typedef PlainObject const& type;
 };

 /***************************************************************************
 * Normal product .coeff() implementation (with meta-unrolling)
 ***************************************************************************/

 /**************************************
 *** Scalar path  - no vectorization ***
 **************************************/

 template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
 struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
   {
     product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
     res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
   }
 };

 template<typename Lhs, typename Rhs, typename RetScalar>
 struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
   {
     res = lhs.coeff(row, 0) * rhs.coeff(0, col);
   }
 };

 template<typename Lhs, typename Rhs, typename RetScalar>
 struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
   {
     eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
     res = lhs.coeff(row, 0) * rhs.coeff(0, col);
       for(Index i = 1; i < lhs.cols(); ++i)
         res += lhs.coeff(row, i) * rhs.coeff(i, col);
   }
 };

 /*******************************************
 *** Scalar path with inner vectorization ***
 *******************************************/

 template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
 struct product_coeff_vectorized_unroller
 {
   typedef typename Lhs::Index Index;
   enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
   {
     product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
     pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
   }
 };

 template<typename Lhs, typename Rhs, typename Packet>
 struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
   {
     pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
   }
 };

 template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
 struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
 {
   typedef typename Lhs::PacketScalar Packet;
   typedef typename Lhs::Index Index;
   enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
   {
     Packet pres;
     product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
     product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
     res = predux(pres);
   }
 };

 template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
 struct product_coeff_vectorized_dyn_selector
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
     res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
   }
 };

 // NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
 // NOTE maybe they are now useless since we have a specialization for Block<Matrix>
 template<typename Lhs, typename Rhs, int RhsCols>
 struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
     res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
   }
 };

 template<typename Lhs, typename Rhs, int LhsRows>
 struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
     res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
   }
 };

 template<typename Lhs, typename Rhs>
 struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
     res = lhs.transpose().cwiseProduct(rhs).sum();
   }
 };

 template<typename Lhs, typename Rhs, typename RetScalar>
 struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
     product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
   }
 };

 /*******************
 *** Packet path  ***
 *******************/

 template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
   {
     product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
     res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
   }
 };

 template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
   {
     product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
     res =  pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
   }
 };

 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
   {
     res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
   }
 };

 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
   {
     res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
   }
 };

 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
   {
     eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
     res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
       for(Index i = 1; i < lhs.cols(); ++i)
         res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
   }
 };

 template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
 struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
 {
   typedef typename Lhs::Index Index;
   static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
   {
     eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
     res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
       for(Index i = 1; i < lhs.cols(); ++i)
         res =  pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
   }
 };

 } // end namespace internal

 } // end namespace Eigen

 #endif // EIGEN_COEFFBASED_PRODUCT_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
	// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

	#ifndef EIGEN_COEFFBASED_PRODUCT_H
	#define EIGEN_COEFFBASED_PRODUCT_H

	namespace Eigen {

	namespace internal {

	/*********************************************************************************
	* Coefficient based product implementation.
	* It is designed for the following use cases:
	* - small fixed sizes
	* - lazy products
	*********************************************************************************/

	/* Since the all the dimensions of the product are small, here we can rely
	* on the generic Assign mechanism to evaluate the product per coeff (or packet).
	*
	* Note that here the inner-loops should always be unrolled.
	*/

	template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
	struct product_coeff_impl;

	template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
	struct product_packet_impl;

	template<typename LhsNested, typename RhsNested, int NestingFlags>
	struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
	{
	typedef MatrixXpr XprKind;
	typedef typename remove_all<LhsNested>::type _LhsNested;
	typedef typename remove_all<RhsNested>::type _RhsNested;
	typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
	typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
	typename traits<_RhsNested>::StorageKind>::ret StorageKind;
	typedef typename promote_index_type<typename traits<_LhsNested>::Index,
	typename traits<_RhsNested>::Index>::type Index;

	enum {
	LhsCoeffReadCost = _LhsNested::CoeffReadCost,
	RhsCoeffReadCost = _RhsNested::CoeffReadCost,
	LhsFlags = _LhsNested::Flags,
	RhsFlags = _RhsNested::Flags,

	RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
	ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
	InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),

	MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
	MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,

	LhsRowMajor = LhsFlags & RowMajorBit,
	RhsRowMajor = RhsFlags & RowMajorBit,

	SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,

	CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
	&& (ColsAtCompileTime == Dynamic
	\|\| ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
	&& (RhsFlags&AlignedBit)
	)
	),

	CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
	&& (RowsAtCompileTime == Dynamic
	\|\| ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
	&& (LhsFlags&AlignedBit)
	)
	),

	EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
	: (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
	: (RhsRowMajor && !CanVectorizeLhs),

	Flags = ((unsigned int)(LhsFlags \| RhsFlags) & HereditaryBits & ~RowMajorBit)
	\| (EvalToRowMajor ? RowMajorBit : 0)
	\| NestingFlags
	\| (LhsFlags & RhsFlags & AlignedBit)
	// TODO enable vectorization for mixed types
	\| (SameType && (CanVectorizeLhs \|\| CanVectorizeRhs) ? PacketAccessBit : 0),

	CoeffReadCost = InnerSize == Dynamic ? Dynamic
	: InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
	+ (InnerSize - 1) * NumTraits<Scalar>::AddCost,

	/* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
	* of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
	* loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
	* the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
	*/
	CanVectorizeInner = SameType
	&& LhsRowMajor
	&& (!RhsRowMajor)
	&& (LhsFlags & RhsFlags & ActualPacketAccessBit)
	&& (LhsFlags & RhsFlags & AlignedBit)
	&& (InnerSize % packet_traits<Scalar>::size == 0)
	};
	};

	} // end namespace internal

	template<typename LhsNested, typename RhsNested, int NestingFlags>
	class CoeffBasedProduct
	: internal::no_assignment_operator,
	public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
	{
	public:

	typedef MatrixBase<CoeffBasedProduct> Base;
	EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
	typedef typename Base::PlainObject PlainObject;

	private:

	typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
	typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;

	enum {
	PacketSize = internal::packet_traits<Scalar>::size,
	InnerSize = internal::traits<CoeffBasedProduct>::InnerSize,
	Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
	CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
	};

	typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
	Unroll ? InnerSize-1 : Dynamic,
	_LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;

	typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;

	public:

	inline CoeffBasedProduct(const CoeffBasedProduct& other)
	: Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
	{}

	template<typename Lhs, typename Rhs>
	inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
	: m_lhs(lhs), m_rhs(rhs)
	{
	// we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
	// We still allow to mix T and complex<T>.
	EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
	YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
	eigen_assert(lhs.cols() == rhs.rows()
	&& "invalid matrix product"
	&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
	}

	EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
	EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }

	EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
	{
	Scalar res;
	ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
	return res;
	}

	/* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
	* which is why we don't set the LinearAccessBit.
	*/
	EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
	{
	Scalar res;
	const Index row = RowsAtCompileTime == 1 ? 0 : index;
	const Index col = RowsAtCompileTime == 1 ? index : 0;
	ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
	return res;
	}

	template<int LoadMode>
	EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
	{
	PacketScalar res;
	internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
	Unroll ? InnerSize-1 : Dynamic,
	_LhsNested, _RhsNested, PacketScalar, LoadMode>
	::run(row, col, m_lhs, m_rhs, res);
	return res;
	}

	// Implicit conversion to the nested type (trigger the evaluation of the product)
	EIGEN_STRONG_INLINE operator const PlainObject& () const
	{
	m_result.lazyAssign(*this);
	return m_result;
	}

	const _LhsNested& lhs() const { return m_lhs; }
	const _RhsNested& rhs() const { return m_rhs; }

	const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
	{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }

	template<int DiagonalIndex>
	const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
	{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }

	const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
	{ return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }

	protected:
	typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
	typename internal::add_const_on_value_type<RhsNested>::type m_rhs;

	mutable PlainObject m_result;
	};

	namespace internal {

	// here we need to overload the nested rule for products
	// such that the nested type is a const reference to a plain matrix
	template<typename Lhs, typename Rhs, int N, typename PlainObject>
	struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit\|EvalBeforeAssigningBit>, N, PlainObject>
	{
	typedef PlainObject const& type;
	};

	/***************************************************************************
	* Normal product .coeff() implementation (with meta-unrolling)
	***************************************************************************/

	/**************************************
	* Scalar path - no vectorization *
	**************************************/

	template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
	struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
	{
	product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
	res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
	}
	};

	template<typename Lhs, typename Rhs, typename RetScalar>
	struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
	{
	res = lhs.coeff(row, 0) * rhs.coeff(0, col);
	}
	};

	template<typename Lhs, typename Rhs, typename RetScalar>
	struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
	{
	eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
	res = lhs.coeff(row, 0) * rhs.coeff(0, col);
	for(Index i = 1; i < lhs.cols(); ++i)
	res += lhs.coeff(row, i) * rhs.coeff(i, col);
	}
	};

	/*******************************************
	* Scalar path with inner vectorization *
	*******************************************/

	template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
	struct product_coeff_vectorized_unroller
	{
	typedef typename Lhs::Index Index;
	enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
	{
	product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
	pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
	}
	};

	template<typename Lhs, typename Rhs, typename Packet>
	struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
	{
	pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
	}
	};

	template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
	struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
	{
	typedef typename Lhs::PacketScalar Packet;
	typedef typename Lhs::Index Index;
	enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
	{
	Packet pres;
	product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
	product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
	res = predux(pres);
	}
	};

	template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
	struct product_coeff_vectorized_dyn_selector
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
	{
	res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
	}
	};

	// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
	// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
	template<typename Lhs, typename Rhs, int RhsCols>
	struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index /row/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
	{
	res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
	}
	};

	template<typename Lhs, typename Rhs, int LhsRows>
	struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index /col/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
	{
	res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
	}
	};

	template<typename Lhs, typename Rhs>
	struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index /row/, Index /col/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
	{
	res = lhs.transpose().cwiseProduct(rhs).sum();
	}
	};

	template<typename Lhs, typename Rhs, typename RetScalar>
	struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
	{
	product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
	}
	};

	/*******************
	* Packet path *
	*******************/

	template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
	struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
	{
	product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
	res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
	}
	};

	template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
	struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
	{
	product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
	res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
	}
	};

	template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
	struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
	{
	res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
	}
	};

	template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
	struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
	{
	res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
	}
	};

	template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
	struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
	{
	eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
	res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
	for(Index i = 1; i < lhs.cols(); ++i)
	res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
	}
	};

	template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
	struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
	{
	typedef typename Lhs::Index Index;
	static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
	{
	eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
	res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
	for(Index i = 1; i < lhs.cols(); ++i)
	res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
	}
	};

	} // end namespace internal

	} // end namespace Eigen

	#endif // EIGEN_COEFFBASED_PRODUCT_H