// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2008-2009 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H #define EIGEN_SELFADJOINT_MATRIX_VECTOR_H namespace Eigen { namespace internal { /* Optimized selfadjoint matrix * vector product: * This algorithm processes 2 columns at onces that allows to both reduce * the number of load/stores of the result by a factor 2 and to reduce * the instruction dependency. */ template struct selfadjoint_matrix_vector_product; template struct selfadjoint_matrix_vector_product { static EIGEN_DONT_INLINE void run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha); }; template EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product::run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { typedef typename packet_traits::type Packet; const Index PacketSize = sizeof(Packet)/sizeof(Scalar); enum { IsRowMajor = StorageOrder==RowMajor ? 1 : 0, IsLower = UpLo == Lower ? 1 : 0, FirstTriangular = IsRowMajor == IsLower }; conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; conj_helper::IsComplex, ConjugateRhs> cjd; conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha; // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed. // if the rhs is not sequentially stored in memory we copy it to a temporary buffer, // this is because we need to extract packets ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast(_rhs) : 0); if (rhsIncr!=1) { const Scalar* it = _rhs; for (Index i=0; i(t0); Scalar t1 = cjAlpha * rhs[j+1]; Packet ptmp1 = pset1(t1); Scalar t2(0); Packet ptmp2 = pset1(t2); Scalar t3(0); Packet ptmp3 = pset1(t3); size_t starti = FirstTriangular ? 0 : j+2; size_t endi = FirstTriangular ? j : size; size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti); size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed res[j] += cjd.pmul(numext::real(A0[j]), t0); res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1); if(FirstTriangular) { res[j] += cj0.pmul(A1[j], t1); t3 += cj1.pmul(A1[j], rhs[j]); } else { res[j+1] += cj0.pmul(A0[j+1],t0); t2 += cj1.pmul(A0[j+1], rhs[j+1]); } for (size_t i=starti; i huge speed up) // gcc 4.2 does this optimization automatically. const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart; const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart; const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; Scalar* EIGEN_RESTRICT resIt = res + alignedStart; for (size_t i=alignedStart; i(a0It); a0It += PacketSize; Packet A1i = ploadu(a1It); a1It += PacketSize; Packet Bi = ploadu(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases Packet Xi = pload (resIt); Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi)); ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3); pstore(resIt,Xi); resIt += PacketSize; } for (size_t i=alignedEnd; i struct traits > : traits, Lhs, Rhs> > {}; } template struct SelfadjointProductMatrix : public ProductBase, Lhs, Rhs > { EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) enum { LhsUpLo = LhsMode&(Upper|Lower) }; SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} template void scaleAndAddTo(Dest& dest, const Scalar& alpha) const { typedef typename Dest::Scalar ResScalar; typedef typename Base::RhsScalar RhsScalar; typedef Map, Aligned> MappedDest; eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols()); typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(m_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(m_rhs); Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) * RhsBlasTraits::extractScalarFactor(m_rhs); enum { EvalToDest = (Dest::InnerStrideAtCompileTime==1), UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1) }; internal::gemv_static_vector_if static_dest; internal::gemv_static_vector_if static_rhs; ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), EvalToDest ? dest.data() : static_dest.data()); ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(), UseRhs ? const_cast(rhs.data()) : static_rhs.data()); if(!EvalToDest) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN int size = dest.size(); EIGEN_DENSE_STORAGE_CTOR_PLUGIN #endif MappedDest(actualDestPtr, dest.size()) = dest; } if(!UseRhs) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN int size = rhs.size(); EIGEN_DENSE_STORAGE_CTOR_PLUGIN #endif Map(actualRhsPtr, rhs.size()) = rhs; } internal::selfadjoint_matrix_vector_product::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run ( lhs.rows(), // size &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info actualRhsPtr, 1, // rhs info actualDestPtr, // result info actualAlpha // scale factor ); if(!EvalToDest) dest = MappedDest(actualDestPtr, dest.size()); } }; namespace internal { template struct traits > : traits, Lhs, Rhs> > {}; } template struct SelfadjointProductMatrix : public ProductBase, Lhs, Rhs > { EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) enum { RhsUpLo = RhsMode&(Upper|Lower) }; SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} template void scaleAndAddTo(Dest& dest, const Scalar& alpha) const { // let's simply transpose the product Transpose destT(dest); SelfadjointProductMatrix, int(RhsUpLo)==Upper ? Lower : Upper, false, Transpose, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha); } }; } // end namespace Eigen #endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H