// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2009 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H #define EIGEN_TRIANGULAR_MATRIX_MATRIX_H namespace Eigen { namespace internal { // template // struct gemm_pack_lhs_triangular // { // Matrix::IsComplex && Conjugate> cj; // const_blas_data_mapper lhs(_lhs,lhsStride); // int count = 0; // const int peeled_mc = (rows/mr)*mr; // for(int i=0; i struct product_triangular_matrix_matrix; template struct product_triangular_matrix_matrix { static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { product_triangular_matrix_matrix ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); } }; // implements col-major += alpha * op(triangular) * op(general) template struct product_triangular_matrix_matrix { typedef gebp_traits Traits; enum { SmallPanelWidth = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower, SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 }; static EIGEN_DONT_INLINE void run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros Index diagSize = (std::min)(_rows,_depth); Index rows = IsLower ? _rows : diagSize; Index depth = IsLower ? diagSize : _depth; Index cols = _cols; const_blas_data_mapper lhs(_lhs,lhsStride); const_blas_data_mapper rhs(_rhs,rhsStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; std::size_t sizeW = kc*Traits::WorkSpaceFactor; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); Matrix triangularBuffer; triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); else triangularBuffer.diagonal().setOnes(); gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? k2>0 : k2rows)) { actual_kc = rows-k2; k2 = k2+actual_kc-kc; } pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols); // the selected lhs's panel has to be split in three different parts: // 1 - the part which is zero => skip it // 2 - the diagonal block => special kernel // 3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP // the block diagonal, if any: if(IsLower || actual_k2(actual_kc-k1, SmallPanelWidth); Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1; Index startBlock = actual_k2+k1; Index blockBOffset = k1; // => GEBP with the micro triangular block // The trick is to pack this micro block while filling the opposite triangular part with zeros. // To this end we do an extra triangular copy to a small temporary buffer for (Index k=0;k0) { Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2; pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset, blockW); } } } // the part below (lower case) or above (upper case) the diagonal => GEPP { Index start = IsLower ? k2 : 0; Index end = IsLower ? rows : (std::min)(actual_k2,rows); for(Index i2=start; i2() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); } } } } // implements col-major += alpha * op(general) * op(triangular) template struct product_triangular_matrix_matrix { typedef gebp_traits Traits; enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower, SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 }; static EIGEN_DONT_INLINE void run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros Index diagSize = (std::min)(_cols,_depth); Index rows = _rows; Index depth = IsLower ? _depth : diagSize; Index cols = IsLower ? diagSize : _cols; const_blas_data_mapper lhs(_lhs,lhsStride); const_blas_data_mapper rhs(_rhs,rhsStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; std::size_t sizeW = kc*Traits::WorkSpaceFactor; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); Matrix triangularBuffer; triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); else triangularBuffer.diagonal().setOnes(); gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k20; IsLower ? k2+=kc : k2-=kc) { Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc); Index actual_k2 = IsLower ? k2 : k2-actual_kc; // align blocks with the end of the triangular part for trapezoidal rhs if(IsLower && (k2cols)) { actual_kc = cols-k2; k2 = actual_k2 + actual_kc - kc; } // remaining size Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2; // size of the triangular part Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc; Scalar* geb = blockB+ts*ts; pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); // pack the triangular part of the rhs padding the unrolled blocks with zeros if(ts>0) { for (Index j2=0; j2(actual_kc-j2, SmallPanelWidth); Index actual_j2 = actual_k2 + j2; Index panelOffset = IsLower ? j2+actualPanelWidth : 0; Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; // general part pack_rhs_panel(blockB+j2*actual_kc, &rhs(actual_k2+panelOffset, actual_j2), rhsStride, panelLength, actualPanelWidth, actual_kc, panelOffset); // append the triangular part via a temporary buffer for (Index j=0;j0) { for (Index j2=0; j2(actual_kc-j2, SmallPanelWidth); Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth; Index blockOffset = IsLower ? j2 : 0; gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride, blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, alpha, actual_kc, actual_kc, // strides blockOffset, blockOffset,// offsets blockW); // workspace } } gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, blockA, geb, actual_mc, actual_kc, rs, alpha, -1, -1, 0, 0, blockW); } } } /*************************************************************************** * Wrapper to product_triangular_matrix_matrix ***************************************************************************/ template struct traits > : traits, Lhs, Rhs> > {}; } // end namespace internal template struct TriangularProduct : public ProductBase, Lhs, Rhs > { EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct) TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} template void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(m_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(m_rhs); Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) * RhsBlasTraits::extractScalarFactor(m_rhs); typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; enum { IsLower = (Mode&Lower) == Lower }; Index stripedRows = ((!LhsIsTriangular) || (IsLower)) ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols()); Index stripedCols = ((LhsIsTriangular) || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows()); Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); BlockingType blocking(stripedRows, stripedCols, stripedDepth); internal::product_triangular_matrix_matrix::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor> ::run( stripedRows, stripedCols, stripedDepth, // sizes &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &dst.coeffRef(0,0), dst.outerStride(), // result info actualAlpha, blocking ); } }; } // end namespace Eigen #endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H