// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2009 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H #define EIGEN_SELFADJOINT_MATRIX_MATRIX_H namespace Eigen { namespace internal { // pack a selfadjoint block diagonal for use with the gebp_kernel template struct symm_pack_lhs { template inline void pack(Scalar* blockA, const const_blas_data_mapper& lhs, Index cols, Index i, Index& count) { // normal copy for(Index k=0; k::size }; const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; if(Pack1>=3*PacketSize) for(Index i=0; i(blockA, lhs, cols, i, count); if(Pack1>=2*PacketSize) for(Index i=peeled_mc3; i(blockA, lhs, cols, i, count); if(Pack1>=1*PacketSize) for(Index i=peeled_mc2; i(blockA, lhs, cols, i, count); // do the same with mr==1 for(Index i=peeled_mc1; i struct symm_pack_rhs { enum { PacketSize = packet_traits::size }; void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { Index end_k = k2 + rows; Index count = 0; const_blas_data_mapper rhs(_rhs,rhsStride); Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; // first part: normal case for(Index j2=0; j2=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } if (nr>=8) { blockB[count+4] = rhs(k,j2+4); blockB[count+5] = rhs(k,j2+5); blockB[count+6] = rhs(k,j2+6); blockB[count+7] = rhs(k,j2+7); } count += nr; } } // second part: diagonal block Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2; if(nr>=8) { for(Index j2=k2; j2=4) { for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4) { // again we can split vertically in three different parts (transpose, symmetric, normal) // transpose for(Index k=k2; k=8) { for(Index j2=k2+rows; j2=4) { for(Index j2=(std::max)(packet_cols8,k2+rows); j2 the same with nr==1) for(Index j2=packet_cols4; j2 struct product_selfadjoint_matrix; template struct product_selfadjoint_matrix { static EIGEN_STRONG_INLINE void run( Index rows, Index cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { product_selfadjoint_matrix::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs), EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, LhsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), ColMajor> ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); } }; template struct product_selfadjoint_matrix { static EIGEN_DONT_INLINE void run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = rows; typedef gebp_traits Traits; typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper LhsTransposeMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); LhsTransposeMapper lhs_transpose(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction // kc must be smaller than mc kc = (std::min)(kc,mc); std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel gebp_kernel; symm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2 transposed packed copy // 2 - the diagonal block => special packed copy // 3 - the panel below the diagonal block => generic packed copy for(Index i2=0; i2() (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } // matrix * selfadjoint product template struct product_selfadjoint_matrix { static EIGEN_DONT_INLINE void run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = cols; typedef gebp_traits Traits; typedef const_blas_data_mapper LhsMapper; typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); ResMapper res(_res,resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2 GEPP for(Index i2=0; i2 struct selfadjoint_product_impl { typedef typename Product::Scalar Scalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; enum { LhsIsUpper = (LhsMode&(Upper|Lower))==Upper, LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint, RhsIsUpper = (RhsMode&(Upper|Lower))==Upper, RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint }; template static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols()); typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs); typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType; BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false); internal::product_selfadjoint_matrix::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor> ::run( lhs.rows(), rhs.cols(), // sizes &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &dst.coeffRef(0,0), dst.outerStride(), // result info actualAlpha, blocking // alpha ); } }; } // end namespace internal } // end namespace Eigen #endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H