// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2009 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H #define EIGEN_TRIANGULAR_MATRIX_MATRIX_H namespace Eigen { namespace internal { // template // struct gemm_pack_lhs_triangular // { // Matrix::IsComplex && Conjugate> cj; // const_blas_data_mapper lhs(_lhs,lhsStride); // int count = 0; // const int peeled_mc = (rows/mr)*mr; // for(int i=0; i struct product_triangular_matrix_matrix; template struct product_triangular_matrix_matrix { static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { product_triangular_matrix_matrix ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); } }; // implements col-major += alpha * op(triangular) * op(general) template struct product_triangular_matrix_matrix { typedef gebp_traits Traits; enum { SmallPanelWidth = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower, SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 }; static EIGEN_DONT_INLINE void run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros Index diagSize = (std::min)(_rows,_depth); Index rows = IsLower ? _rows : diagSize; Index depth = IsLower ? diagSize : _depth; Index cols = _cols; typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction // The small panel size must not be larger than blocking size. // Usually this should never be the case because SmallPanelWidth^2 is very small // compared to L2 cache size, but let's be safe: Index panelWidth = (std::min)(Index(SmallPanelWidth),(std::min)(kc,mc)); std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); // To work around an "error: member reference base type 'Matrix<...> // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is // not a structure or union" compilation error in nvcc (tested V8.0.61), // create a dummy internal::constructor_without_unaligned_array_assert // object to pass to the Matrix constructor. internal::constructor_without_unaligned_array_assert a; Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); else triangularBuffer.diagonal().setOnes(); gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? k2>0 : k2rows)) { actual_kc = rows-k2; k2 = k2+actual_kc-kc; } pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols); // the selected lhs's panel has to be split in three different parts: // 1 - the part which is zero => skip it // 2 - the diagonal block => special kernel // 3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP // the block diagonal, if any: if(IsLower || actual_k2(actual_kc-k1, panelWidth); Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1; Index startBlock = actual_k2+k1; Index blockBOffset = k1; // => GEBP with the micro triangular block // The trick is to pack this micro block while filling the opposite triangular part with zeros. // To this end we do an extra triangular copy to a small temporary buffer for (Index k=0;k0) { Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2; pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); } } } // the part below (lower case) or above (upper case) the diagonal => GEPP { Index start = IsLower ? k2 : 0; Index end = IsLower ? rows : (std::min)(actual_k2,rows); for(Index i2=start; i2() (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); } } } } // implements col-major += alpha * op(general) * op(triangular) template struct product_triangular_matrix_matrix { typedef gebp_traits Traits; enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower, SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 }; static EIGEN_DONT_INLINE void run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { const Index PacketBytes = packet_traits::size*sizeof(Scalar); // strip zeros Index diagSize = (std::min)(_cols,_depth); Index rows = _rows; Index depth = IsLower ? _depth : diagSize; Index cols = IsLower ? diagSize : _cols; typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols+EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); internal::constructor_without_unaligned_array_assert a; Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); else triangularBuffer.diagonal().setOnes(); gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k20; IsLower ? k2+=kc : k2-=kc) { Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc); Index actual_k2 = IsLower ? k2 : k2-actual_kc; // align blocks with the end of the triangular part for trapezoidal rhs if(IsLower && (k2cols)) { actual_kc = cols-k2; k2 = actual_k2 + actual_kc - kc; } // remaining size Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2; // size of the triangular part Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc; Scalar* geb = blockB+ts*ts; geb = geb + internal::first_aligned(geb,PacketBytes/sizeof(Scalar)); pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs); // pack the triangular part of the rhs padding the unrolled blocks with zeros if(ts>0) { for (Index j2=0; j2(actual_kc-j2, SmallPanelWidth); Index actual_j2 = actual_k2 + j2; Index panelOffset = IsLower ? j2+actualPanelWidth : 0; Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; // general part pack_rhs_panel(blockB+j2*actual_kc, rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); // append the triangular part via a temporary buffer for (Index j=0;j0) { for (Index j2=0; j2(actual_kc-j2, SmallPanelWidth); Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth; Index blockOffset = IsLower ? j2 : 0; gebp_kernel(res.getSubMapper(i2, actual_k2 + j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, alpha, actual_kc, actual_kc, // strides blockOffset, blockOffset);// offsets } } gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2), blockA, geb, actual_mc, actual_kc, rs, alpha, -1, -1, 0, 0); } } } /*************************************************************************** * Wrapper to product_triangular_matrix_matrix ***************************************************************************/ } // end namespace internal namespace internal { template struct triangular_product_impl { template static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha) { typedef typename Lhs::Scalar LhsScalar; typedef typename Rhs::Scalar RhsScalar; typedef typename Dest::Scalar Scalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; typedef typename internal::remove_all::type ActualLhsTypeCleaned; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; typedef typename internal::remove_all::type ActualRhsTypeCleaned; typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs); RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs); Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha; typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; enum { IsLower = (Mode&Lower) == Lower }; Index stripedRows = ((!LhsIsTriangular) || (IsLower)) ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols()); Index stripedCols = ((LhsIsTriangular) || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows()); Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor> ::run( stripedRows, stripedCols, stripedDepth, // sizes &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &dst.coeffRef(0,0), dst.outerStride(), // result info actualAlpha, blocking ); // Apply correction if the diagonal is unit and a scalar factor was nested: if ((Mode&UnitDiag)==UnitDiag) { if (LhsIsTriangular && lhs_alpha!=LhsScalar(1)) { Index diagSize = (std::min)(lhs.rows(),lhs.cols()); dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize); } else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1)) { Index diagSize = (std::min)(rhs.rows(),rhs.cols()); dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize); } } } }; } // end namespace internal } // end namespace Eigen #endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H