/*
* Copyright 2009-2019 The VOTCA Development Team
* (http://www.votca.org)
*
* Licensed under the Apache License, Version 2.0 (the "License")
*
* You may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
#include <votca/xtp/aomatrix.h>
#include <votca/xtp/logger.h>
#include <votca/xtp/threecenter.h>
using std::flush;
namespace votca {
namespace xtp {
void TCMatrix_gwbse::Initialize(Index basissize, Index mmin, Index mmax,
Index nmin, Index nmax) {
// here as storage indices starting from zero
_nmin = nmin;
_nmax = nmax;
_ntotal = nmax - nmin + 1;
_mmin = mmin;
_mmax = mmax;
_mtotal = mmax - mmin + 1;
_auxbasissize = basissize;
// the 3-center object is stored as mtotal matrices of size (ntotal x auxbasissize), one per m-level
_matrix = std::vector<Eigen::MatrixXd>(
_mtotal, Eigen::MatrixXd::Zero(_ntotal, _auxbasissize));
}
/*
* Modify 3-center matrix elements consistent with the use of the symmetrized
* Coulomb interaction, using either CUDA or OpenMP
*/
void TCMatrix_gwbse::MultiplyRightWithAuxMatrix(const Eigen::MatrixXd& matrix) {
// Try to run the operation on an Nvidia GPU; otherwise fall back to the
// default OpenMP implementation
#if defined(USE_CUDA)
if (count_available_gpus() > 0) {
MultiplyRightWithAuxMatrixCuda(matrix);
} else {
MultiplyRightWithAuxMatrixOpenMP(matrix);
}
#else
MultiplyRightWithAuxMatrixOpenMP(matrix);
#endif
return;
}
/*
* Fill the 3-center object by looping over the shells of the auxiliary (GW)
* basis set; for each shell the 3-center integrals with the DFT basis are
* calculated (ComputeSymmStorage) and convolved with the DFT orbital
* coefficients (FillBlock)
*/
void TCMatrix_gwbse::Fill(const AOBasis& gwbasis, const AOBasis& dftbasis,
const Eigen::MatrixXd& dft_orbitals) {
// needed for Rebuild()
_auxbasis = &gwbasis;
_dftbasis = &dftbasis;
_dft_orbitals = &dft_orbitals;
// If CUDA is enabled, the DFT orbitals are first sent to the GPU and device
// memory is allocated for the intermediate matrices
#if defined(USE_CUDA)
if (count_available_gpus() > 0) {
FillAllBlocksCuda(gwbasis, dftbasis, dft_orbitals);
} else {
FillAllBlocksOpenMP(gwbasis, dftbasis, dft_orbitals);
}
#else
FillAllBlocksOpenMP(gwbasis, dftbasis, dft_orbitals);
#endif
AOOverlap auxoverlap;
auxoverlap.Fill(gwbasis);
AOCoulomb auxcoulomb;
auxcoulomb.Fill(gwbasis);
Eigen::MatrixXd inv_sqrt = auxcoulomb.Pseudo_InvSqrt_GWBSE(auxoverlap, 5e-7);
_removedfunctions = auxcoulomb.Removedfunctions();
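// Right-multiplication with the (pseudo-)inverse square root of the auxiliary
// Coulomb matrix yields the 3-center integrals for the symmetrized Coulomb
// interaction, M^I_mn -> sum_J M^J_mn (V^-1/2)_JI; auxiliary functions that
// are (nearly) linearly dependent were removed in the construction of inv_sqrt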
MultiplyRightWithAuxMatrix(inv_sqrt);
return;
}
/*
* Determines the 3-center integrals for a given shell of the auxiliary (GW)
* basis by calculating the 3-center repulsion integrals of the functions in
* that shell with ALL function pairs of the DFT basis set
* (FillThreeCenterRepBlock)
*/
std::vector<Eigen::MatrixXd> TCMatrix_gwbse::ComputeSymmStorage(
const AOShell& auxshell, const AOBasis& dftbasis) const {
std::vector<Eigen::MatrixXd> symmstorage = std::vector<Eigen::MatrixXd>(
auxshell.getNumFunc(),
Eigen::MatrixXd::Zero(dftbasis.AOBasisSize(), dftbasis.AOBasisSize()));
// alpha-loop over the "left" DFT basis function
for (Index row = 0; row < dftbasis.getNumofShells(); row++) {
const AOShell& shell_row = dftbasis.getShell(row);
const Index row_start = shell_row.getStartIndex();
// the 3-center matrix is symmetric in the two DFT indices; restrict the
// explicit calculation to the lower triangular part
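// gamma-loop over the "right" DFT basis function; the upper triangle is
// restored later via selfadjointView in FillBlock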
for (Index col = 0; col <= row; col++) {
const AOShell& shell_col = dftbasis.getShell(col);
const Index col_start = shell_col.getStartIndex();
Eigen::Tensor<double, 3> threec_block(auxshell.getNumFunc(),
shell_row.getNumFunc(),
shell_col.getNumFunc());
threec_block.setZero();
bool nonzero =
FillThreeCenterRepBlock(threec_block, auxshell, shell_row, shell_col);
if (nonzero) {
for (Index aux_c = 0; aux_c < auxshell.getNumFunc(); aux_c++) {
for (Index row_c = 0; row_c < shell_row.getNumFunc(); row_c++) {
for (Index col_c = 0; col_c < shell_col.getNumFunc(); col_c++) {
// symmetry
if ((col_start + col_c) > (row_start + row_c)) {
break;
}
symmstorage[aux_c](row_start + row_c, col_start + col_c) =
threec_block(aux_c, row_c, col_c);
} // COL copy
} // ROW copy
} // AUX copy
}
} // gamma-loop
} // alpha-loop
return symmstorage;
}
/*
* Convolution of the 3-center integrals for one auxiliary (GW) shell with the
* DFT orbital coefficients, i.e. transformation from the AO to the MO basis
*/
std::vector<Eigen::MatrixXd> TCMatrix_gwbse::FillBlock(
const std::vector<Eigen::MatrixXd>& symmstorage,
const Eigen::MatrixXd& dft_orbitals) const {
std::vector<Eigen::MatrixXd> block = std::vector<Eigen::MatrixXd>(
_mtotal, Eigen::MatrixXd::Zero(_ntotal, symmstorage.size()));
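// MO coefficient sub-blocks: columns _mmin.._mmax (dftm) and _nmin.._nmax
// (dftn) of the full coefficient matrix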
const Eigen::MatrixXd dftm =
dft_orbitals.block(0, _mmin, dft_orbitals.rows(), _mtotal);
const Eigen::MatrixXd dftn =
dft_orbitals.block(0, _nmin, dft_orbitals.rows(), _ntotal);
Index dim = static_cast<Index>(symmstorage.size());
for (Index k = 0; k < dim; ++k) {
const Eigen::MatrixXd& matrix = symmstorage[k];
Eigen::MatrixXd threec_inMo =
dftn.transpose() * matrix.selfadjointView<Eigen::Lower>() * dftm;
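// threec_inMo(n, m) = sum_{alpha,beta} c_{alpha n} (I_k|alpha beta) c_{beta m};
// redistribute its columns into the per-m storage blocks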
for (Index i = 0; i < threec_inMo.cols(); ++i) {
block[i].col(k) = threec_inMo.col(i);
}
}
return block;
}
/*
* Modify 3-center matrix elements consistent with the use of the symmetrized
* Coulomb interaction using OpenMP parallelization
*/
void TCMatrix_gwbse::MultiplyRightWithAuxMatrixOpenMP(
const Eigen::MatrixXd& matrix) {
XTP_LOG(Log::info, _log)
<< TimeStamp() << " Using Default OpenMP for tensor matrix multiplication"
<< flush;
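// each of the mtotal blocks is an independent (ntotal x auxbasissize) matrix,
// so the multiplications can be distributed trivially over OpenMP threads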
#pragma omp parallel for
for (Index i_occ = 0; i_occ < _mtotal; i_occ++) {
_matrix[i_occ] *= matrix;
}
return;
}
void TCMatrix_gwbse::FillAllBlocksOpenMP(const AOBasis& gwbasis,
const AOBasis& dftbasis,
const Eigen::MatrixXd& dft_orbitals) {
#pragma omp parallel for schedule(guided)
for (Index is = 0; is < gwbasis.getNumofShells(); is++) {
const AOShell& shell = gwbasis.getShell(is);
// Fill the block for this shell: 3-center integrals with the DFT basis,
// transformed with the DFT orbital coefficients
std::vector<Eigen::MatrixXd> symmstorage =
ComputeSymmStorage(shell, dftbasis);
std::vector<Eigen::MatrixXd> block = FillBlock(symmstorage, dft_orbitals);
// put into correct position
for (Index m_level = 0; m_level < _mtotal; m_level++) {
_matrix[m_level].block(0, shell.getStartIndex(), _ntotal,
shell.getNumFunc()) = block[m_level];
} // m-th DFT orbital
} // shells of GW basis set
}
#if defined(USE_CUDA)
/*
* The Cuda device behaves like a server that receives matrix-matrix
* multiplications from a single stream (an Nvidia queue) and handles them
* asynchronously. It performs the following operations when receiving a
* request:
* 1. Check that there is enough space for the arrays
* 2. Allocate memory for each matrix
* 3. Copy the matrix to the allocated space
* 4. Perform the matrix multiplication
* 5. Return the result matrix
* The Cuda device knows to which memory address it needs to copy back the
* result. See: https://docs.nvidia.com/cuda/cublas/index.html#thread-safety2
*/
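/*
* A minimal sketch of such an exchange, using only the CudaMatrix/CudaPipeline
* calls that appear below (copy_to_gpu, gemm, conversion back to Eigen):
*   CudaMatrix A{rows, cols, stream};  // allocate a device buffer
*   A.copy_to_gpu(host_matrix);        // host -> device copy
*   pipeline.gemm(A, B, C);            // multiplication on the GPU stream
*   Eigen::MatrixXd result = C;        // device -> host copy of the product
*/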
/*
* Modify 3-center matrix elements consistent with the use of the symmetrized
* Coulomb interaction using OpenMP/CUDA parallelization
*/
void TCMatrix_gwbse::MultiplyRightWithAuxMatrixCuda(
const Eigen::MatrixXd& matrix) {
XTP_LOG(Log::info, _log)
<< TimeStamp() << " Using CUDA/OpenMP for tensor matrix multiplication"
<< flush;
CudaPipeline cuda_pip;
const Eigen::MatrixXd& head = _matrix.front();
const cudaStream_t& stream = cuda_pip.get_stream();
CudaMatrix cuma_A{head.rows(), head.cols(), stream};
CudaMatrix cuma_B{matrix, stream};
CudaMatrix cuma_C{head.rows(), matrix.cols(), stream};
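// device buffers are allocated once and reused for every m-level: cuma_A holds
// the current 3-center block, cuma_B the (constant) auxiliary matrix, cuma_C
// the product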
#pragma omp parallel for schedule(dynamic)
for (Index i_occ = 0; i_occ < _mtotal; i_occ++) {
// All GPU communication happens through a single thread, which reuses the
// memory allocated on the GPU; the work is dynamically load-balanced by
// OpenMP. The remaining threads use the default CPU matrix multiplication
if (OPENMP::getThreadId() == 0) {
cuma_A.copy_to_gpu(_matrix[i_occ]);
cuda_pip.gemm(cuma_A, cuma_B, cuma_C);
_matrix[i_occ] = cuma_C;
} else {
_matrix[i_occ] *= matrix;
}
}
}
/*
* Fill the 3-center object; the convolution with the DFT orbital coefficients
* is offloaded to an Nvidia GPU where possible
*/
void TCMatrix_gwbse::FillAllBlocksCuda(const AOBasis& gwbasis,
const AOBasis& dftbasis,
const Eigen::MatrixXd& dft_orbitals) {
CudaPipeline cuda_pip;
std::array<CudaMatrix, 2> cuda_matrices =
SendDFTMatricesToGPU(dft_orbitals, cuda_pip);
std::array<CudaMatrix, 3> cuda_inter_matrices =
CreateIntermediateCudaMatrices(dft_orbitals.rows(), cuda_pip);
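// cuda_matrices holds the MO coefficient sub-blocks (c_n^T and c_m) on the
// device, cuda_inter_matrices the reusable buffers for the AO matrix and the
// two gemm products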
// loop over all shells in the GW basis and get _Mmn for that shell
#pragma omp parallel for schedule(dynamic)
for (Index is = 0; is < gwbasis.getNumofShells(); is++) {
const AOShell& shell = gwbasis.getShell(is);
// Fill the block for this shell: 3-center integrals with the DFT basis,
// transformed with the DFT orbital coefficients
std::vector<Eigen::MatrixXd> symmstorage =
ComputeSymmStorage(shell, dftbasis);
// All GPU communication happens through a single thread, which reuses the
// memory allocated on the GPU; the work is dynamically load-balanced by
// OpenMP. The remaining threads perform the convolution using the default
// CPU method.
std::vector<Eigen::MatrixXd> block;
if (OPENMP::getThreadId() == 0) {
block = FillBlockCUDA(symmstorage, cuda_matrices, cuda_inter_matrices,
cuda_pip);
} else {
block = FillBlock(symmstorage, dft_orbitals);
}
// put into correct position
for (Index m_level = 0; m_level < _mtotal; m_level++) {
_matrix[m_level].block(0, shell.getStartIndex(), _ntotal,
shell.getNumFunc()) = block[m_level];
} // m-th DFT orbital
} // shells of GW basis set
}
std::vector<Eigen::MatrixXd> TCMatrix_gwbse::FillBlockCUDA(
const std::vector<Eigen::MatrixXd>& symmstorage,
const std::array<CudaMatrix, 2>& cuda_matrices,
std::array<CudaMatrix, 3>& cuda_inter_matrices,
const CudaPipeline& cuda_pip) const {
std::vector<Eigen::MatrixXd> block = std::vector<Eigen::MatrixXd>(
_mtotal, Eigen::MatrixXd::Zero(_ntotal, symmstorage.size()));
try {
const CudaMatrix& cuma_A = cuda_matrices[0];
const CudaMatrix& cuma_C = cuda_matrices[1];
CudaMatrix& cuma_B = cuda_inter_matrices[0];
CudaMatrix& cuma_X = cuda_inter_matrices[1];
CudaMatrix& cuma_Y = cuda_inter_matrices[2];
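// the two gemm calls below perform the same AO -> MO transformation as in
// FillBlock: X = c_n^T * (I_k|alpha beta), Y = X * c_m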
Index dim = static_cast<Index>(symmstorage.size());
for (Index k = 0; k < dim; ++k) {
const Eigen::MatrixXd& matrix = symmstorage[k];
cuma_B.copy_to_gpu(matrix.selfadjointView<Eigen::Lower>());
cuda_pip.gemm(cuma_A, cuma_B, cuma_X);
cuda_pip.gemm(cuma_X, cuma_C, cuma_Y);
Eigen::MatrixXd threec_inMo = cuma_Y;
for (Index i = 0; i < threec_inMo.cols(); ++i) {
block[i].col(k) = threec_inMo.col(i);
}
}
} catch (const std::runtime_error& error) {
XTP_LOG(Log::error, _log)
<< TimeStamp() << " FillBlockCUDA failed due to: " << error.what()
<< flush;
throw;
}
return block;
}
std::array<CudaMatrix, 2> TCMatrix_gwbse::SendDFTMatricesToGPU(
const Eigen::MatrixXd& dft_orbitals, const CudaPipeline& cuda_pip) const {
const Eigen::MatrixXd dftm =
dft_orbitals.block(0, _mmin, dft_orbitals.rows(), _mtotal);
const Eigen::MatrixXd dftn =
dft_orbitals.block(0, _nmin, dft_orbitals.rows(), _ntotal);
// copy the transposed n-block and the m-block of the MO coefficients to the GPU
const cudaStream_t& stream = cuda_pip.get_stream();
return {CudaMatrix{dftn.transpose(), stream}, CudaMatrix{dftm, stream}};
}
std::array<CudaMatrix, 3> TCMatrix_gwbse::CreateIntermediateCudaMatrices(
Index basissize, const CudaPipeline& cuda_pip) const {
// buffer dimensions must match the gemm operands in FillBlockCUDA:
// B (basissize x basissize), X (ntotal x basissize), Y (ntotal x mtotal)
const cudaStream_t& stream = cuda_pip.get_stream();
return {CudaMatrix{basissize, basissize, stream},
CudaMatrix{_ntotal, basissize, stream},
CudaMatrix{_ntotal, _mtotal, stream}};
}
#endif
} // namespace xtp
} // namespace votca