cusparseSolver can now apply well contributions separately, so --matrix-add-well-contributions=true is no longer required

T.D. (Tongdong) Qiu 2020-03-13 14:21:59 +01:00
parent 5b457cbbd6
commit 581cbc6a3e
12 changed files with 763 additions and 77 deletions
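In outline, the commit threads a WellContributions object from the well model through BdaBridge into the GPU solver. The fragment below is condensed from the ISTLSolverEbos and cusparseSolverBackend diffs further down; identifiers are taken from those diffs and all surrounding code is elided, so it is an illustrative excerpt rather than compilable code.

// ISTLSolverEbos: collect the well blocks when they are not assembled into matrix A
WellContributions wellContribs;
const bool addWellContribs = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
if (!addWellContribs && use_gpu) {
    simulator_.problem().wellModel().getWellContributions(wellContribs);
}
bdaBridge->solve_system(matrix_.get(), istlb, wellContribs, result);

// cusparseSolverBackend::gpu_pbicgstab(): after each sparse matrix-vector product,
// the wells are applied separately instead of being part of A
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb,
               &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
if (wellContribs.get_num_wells() > 0) {
    wellContribs.apply(d_pw, d_v);   // d_v -= C^T * (D^-1 * (B * d_pw))
}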

View File

@ -45,6 +45,7 @@ list (APPEND MAIN_SOURCE_FILES
if(CUDA_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
endif()
# originally generated with the command:

View File

@ -240,11 +240,7 @@ protected:
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
const bool matrix_add_well_contributions = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
if (use_gpu && !matrix_add_well_contributions) {
OPM_THROW(std::logic_error,"Error cannot use GPU solver if command line parameter --matrix-add-well-contributions is false, because the GPU solver performs a standard bicgstab");
}
bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance));
#else
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
@ -442,14 +438,20 @@ protected:
// tries to solve linear system
// solve_system() does nothing if Dune is selected
#if HAVE_CUDA
bdaBridge->solve_system(matrix_.get(), istlb, result);
WellContributions wellContribs;
const bool addWellContribs = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
if(!addWellContribs && use_gpu)
{
simulator_.problem().wellModel().getWellContributions(wellContribs);
}
bdaBridge->solve_system(matrix_.get(), istlb, wellContribs, result);
if (result.converged) {
// get result vector x from non-Dune backend, iff solve was successful
bdaBridge->get_result(x);
} else {
// CPU fallback, or default case for Dune
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
if (use_gpu) {
OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
}

View File

@ -41,6 +41,7 @@ BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, doub
{
if (use_gpu) {
backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
WellContributions::setMode(use_gpu);
}
}
#else
@ -121,7 +122,7 @@ void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector
#endif
template <class BridgeMatrix, class BridgeVector>
void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
{
#if HAVE_CUDA
@ -169,7 +170,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
// assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), result);
cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
switch(status) {
case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
//OpmLog::info("cusparseSolver converged");
@ -210,6 +211,7 @@ Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>, std::allocator<Opm::MatrixBlock
Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>, std::allocator<Opm::MatrixBlock<double, 1, 1> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > &b, \
WellContributions& wellContribs, \
InverseOperatorResult &res);
template void BdaBridge::solve_system< \
@ -217,6 +219,7 @@ Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b, \
WellContributions& wellContribs, \
InverseOperatorResult &res);
template void BdaBridge::solve_system< \
@ -224,6 +227,7 @@ Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b, \
WellContributions& wellContribs, \
InverseOperatorResult &res);
template void BdaBridge::solve_system< \
@ -231,6 +235,7 @@ Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b, \
WellContributions& wellContribs, \
InverseOperatorResult &res);
template void BdaBridge::get_result< \

View File

@ -26,6 +26,8 @@
#include "dune/istl/bcrsmatrix.hh"
#include <opm/simulators/linalg/matrixblock.hh>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#if HAVE_CUDA
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
#endif
@ -55,11 +57,12 @@ public:
/// Solve linear system, A*x = b
/// \param[in] mat matrix A, should be of type Dune::BCRSMatrix
/// \param[in] b vector b, should be of type Dune::BlockVector
/// \param[in] result summary of solver result
/// \param[in] mat matrix A, should be of type Dune::BCRSMatrix
/// \param[in] b vector b, should be of type Dune::BlockVector
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] result summary of solver result
template <class BridgeMatrix, class BridgeVector>
void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result);
void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
/// Get the resulting x vector
/// \param[inout] x vector x, should be of type Dune::BlockVector

View File

@ -0,0 +1,420 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cstdlib>
#include <cstring>
#include <config.h> // CMake
#ifdef __NVCC__
#include "opm/simulators/linalg/bda/cuda_header.hpp"
#include <cuda_runtime.h>
#endif
#include "opm/simulators/linalg/bda/WellContributions.hpp"
namespace Opm
{
// apply WellContributions using y -= C^T * (D^-1 * (B * x))
#if HAVE_CUDA
__global__ void apply_well_contributions( \
const double * __restrict__ Cnnzs, \
const double * __restrict__ Dnnzs, \
const double * __restrict__ Bnnzs, \
const int * __restrict__ Ccols, \
const int * __restrict__ Bcols, \
const double * __restrict__ x, \
double * __restrict__ y, \
const int dim, \
const int dim_wells, \
const unsigned int * __restrict__ val_pointers \
)
{
const int idx_b = blockIdx.x;
const int idx_t = threadIdx.x;
int idx = idx_b * blockDim.x + idx_t;
const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b];
const int vals_per_block = dim * dim_wells; // 12
const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24
const int num_blocks_per_warp = 32/vals_per_block; // 2
const int lane = idx_t % 32;
const int c = lane % dim; // col in block
const int r = (lane / dim) % dim_wells; // row in block
const int NUM_THREADS = gridDim.x * blockDim.x;
extern __shared__ double smem[];
double * __restrict__ z1 = smem;
double * __restrict__ z2 = z1 + dim_wells;
if(idx_t < dim_wells){
z1[idx_t] = 0.0;
}
__syncthreads();
// z1 = B * x
if(idx_t < num_active_threads){
// multiply all blocks with x
double temp = 0.0;
int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks
while(b < val_size + val_pointers[idx_b]){
int colIdx = Bcols[b];
temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
b += num_blocks_per_warp;
}
// merge all blocks into 1 dim*dim_wells block
// since NORNE has only 2 parallel blocks, do not use a loop
temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);
b = idx_t / vals_per_block + val_pointers[idx_b];
// merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
if(idx_t < vals_per_block){
// should be a loop as well, now only works for dim == 3
if(c == 0 || c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 2);} // add col 2 to col 0
if(c == 0 || c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0
}
// write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
if(c == 0 && idx_t < vals_per_block){
z1[r] = temp;
}
}
__syncthreads();
// z2 = D^-1 * B * x = D^-1 * z1
if(idx_t < dim_wells){
double temp = 0.0;
for(int c = 0; c < dim_wells; ++c){
temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
}
z2[idx_t] = temp;
}
__syncthreads();
// y -= C^T * D^-1 * B * x
// use dim * val_size threads, each block is assigned 'dim' threads
if(idx_t < dim * val_size){
double temp = 0.0;
int b = idx_t / dim + val_pointers[idx_b];
int cc = idx_t % dim;
int colIdx = Ccols[b];
for(unsigned int c = 0; c < dim_wells; ++c){
temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
}
y[colIdx * dim + cc] -= temp;
}
}
#endif
void WellContributions::alloc_all(){
#if HAVE_CUDA
if(gpu_mode){
alloc_gpu();
}else{
alloc_cpu();
}
#else
alloc_cpu();
#endif
allocated = true;
}
void WellContributions::alloc_cpu(){
Cnnzs = new double[num_blocks * dim * dim_wells];
Dnnzs = new double[num_wells * dim_wells * dim_wells];
Bnnzs = new double[num_blocks * dim * dim_wells];
Ccols = new int[num_blocks];
Bcols = new int[num_blocks];
val_pointers = new unsigned int[num_wells + 1];
z1 = new double[dim_wells]; // B * x
z2 = new double[dim_wells]; // D^-1 * B * x
}
#if HAVE_CUDA
void WellContributions::alloc_gpu(){
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
val_pointers = new unsigned int[num_wells + 1];
cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}
#endif
WellContributions::~WellContributions()
{
#if HAVE_CUDA
if(gpu_mode){
free_gpu();
}else{
free_cpu();
}
#else
free_cpu();
#endif
}
void WellContributions::free_cpu(){
delete[] Cnnzs;
delete[] Dnnzs;
delete[] Bnnzs;
delete[] Ccols;
delete[] Bcols;
delete[] val_pointers;
delete[] z1;
delete[] z2;
//delete[] Mnnzs;
}
#if HAVE_CUDA
void WellContributions::free_gpu(){
cudaFree(d_Cnnzs);
cudaFree(d_Dnnzs);
cudaFree(d_Bnnzs);
cudaFree(d_Ccols);
cudaFree(d_Bcols);
delete[] val_pointers;
cudaFree(d_val_pointers);
// cudaFree(d_z1);
// cudaFree(d_z2);
}
#endif
void WellContributions::apply(double *x, double *y){
#if HAVE_CUDA
if (gpu_mode){
apply_gpu(x, y);
}else{
apply_cpu(x, y);
}
#else
apply_cpu(x, y);
#endif
}
// Apply the WellContributions, similar to StandardWell::apply()
// y -= (C^T *(D^-1*( B*x)))
void WellContributions::apply_cpu(double *x, double *y)
{
#if 0
// Mnnzs contains a sparse matrix with a symmetric pattern
// Mrows would contain 'i*val_size' for every entry i, since every row has the same number of blocks
// Mcols are the same as Ccols, normally, there is an entry for every block, but since all rows have the same sparsity pattern, we only have to store 1 row
bool dbg = false;
for(int i = 0; i < dim*dim*val_size*val_size; ++i){
if(dbg)printf("Mnnzs[%d]: %.5e\n", i, Mnnzs[i]);
}
if(dbg)printf("row_size: %u, val_size: %u\n", row_size, val_size);
for(int r = 0; r < val_size; ++r){
for(int c = 0; c < val_size; ++c){
int colIdx = Ccols[c];
if(dbg)printf("colIdx: %d\n", colIdx);
for(int i = 0; i < dim; ++i){
double sum = 0.0;
for(int j = 0; j < dim; ++j){
sum += Mnnzs[r * dim * dim * val_size + c * dim * dim + i * dim + j] * x[colIdx * dim + j];
}
if(dbg)printf("sum: %f\n", sum);
y[colIdx * dim + i] -= sum;
}
}
}
if(dbg)exit(0);
#else
for(int wellID = 0; wellID < num_wells; ++wellID){
unsigned int val_size = val_pointers[wellID+1] - val_pointers[wellID];
// B * x
for (unsigned int i = 0; i < dim_wells; ++i) {
z1[i] = 0.0;
}
for (unsigned int i = 0; i < val_size; ++i) {
unsigned int blockID = i + val_pointers[wellID];
int colIdx = Bcols[blockID];
for (unsigned int j = 0; j < dim_wells; ++j) {
double temp = 0.0;
for (unsigned int k = 0; k < dim; ++k) {
temp += Bnnzs[blockID * dim * dim_wells + j * dim + k] * x[colIdx * dim + k];
}
z1[j] += temp;
}
}
// D^-1 * B * x
for (unsigned int i = 0; i < dim_wells; ++i) {
z2[i] = 0.0;
}
for (unsigned int j = 0; j < dim_wells; ++j) {
double temp = 0.0;
for (unsigned int k = 0; k < dim_wells; ++k) {
temp += Dnnzs[wellID * dim_wells * dim_wells + j * dim_wells + k] * z1[k];
}
z2[j] += temp;
}
// C^T * D^-1 * B * x
for (unsigned int i = 0; i < val_size; ++i) {
unsigned int blockID = i + val_pointers[wellID];
int colIdx = Ccols[blockID];
for (unsigned int j = 0; j < dim; ++j) {
double temp = 0.0;
for (unsigned int k = 0; k < dim_wells; ++k) {
temp += Cnnzs[blockID * dim * dim_wells + j + k * dim] * z2[k];
}
y[colIdx * dim + j] -= temp;
}
}
}
#endif
}
// Apply the WellContributions, similar to StandardWell::apply()
// y -= (C^T *(D^-1*( B*x)))
#if HAVE_CUDA
void WellContributions::apply_gpu(double *d_x, double *d_y)
{
int smem_size = 2 * sizeof(double) * dim_wells;
apply_well_contributions<<<num_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
}
#endif
void WellContributions::addMatrix(int idx, int *colIndices, double *values, unsigned int val_size)
{
#if HAVE_CUDA
if(gpu_mode){
addMatrix_gpu(idx, colIndices, values, val_size);
}else{
addMatrix_cpu(idx, colIndices, values, val_size);
}
#else
addMatrix_cpu(idx, colIndices, values, val_size);
#endif
if(idx == 2){
num_blocks_so_far += val_size;
num_wells_so_far++;
}
}
void WellContributions::addMatrix_cpu(int idx, int *colIndices, double *values, unsigned int val_size)
{
switch (idx) {
case 0:
memcpy(Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells);
memcpy(Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size);
break;
case 1:
memcpy(Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells);
break;
case 2:
memcpy(Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells);
memcpy(Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size);
val_pointers[num_wells_so_far] = num_blocks_so_far;
if(num_wells_so_far == num_wells - 1){
val_pointers[num_wells] = num_blocks;
}
break;
case 3:
// store (C*D*B)
printf("ERROR unsupported matrix ID for WellContributions::addMatrix()\n");
exit(1);
// memcpy(Mnnzs, values, sizeof(double) * dim * dim * val_size * val_size);
// memcpy(Ccols, colIndices, sizeof(int) * val_size);
break;
default:
printf("ERROR unknown matrix ID for WellContributions::addMatrix()\n");
exit(1);
}
}
#if HAVE_CUDA
void WellContributions::addMatrix_gpu(int idx, int *colIndices, double *values, unsigned int val_size)
{
switch (idx) {
case 0:
cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
break;
case 1:
cudaMemcpy(d_Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
break;
case 2:
cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
val_pointers[num_wells_so_far] = num_blocks_so_far;
if(num_wells_so_far == num_wells - 1){
val_pointers[num_wells] = num_blocks;
}
cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_wells+1), cudaMemcpyHostToDevice);
break;
case 3:
// store (C*D*B)
printf("ERROR unsupported matrix ID for WellContributions::addMatrix()\n");
exit(1);
break;
default:
printf("ERROR unknown matrix ID for WellContributions::addMatrix()\n");
exit(1);
}
cudaCheckLastError("WellContributions::addMatrix() failed");
}
void WellContributions::setCudaStream(cudaStream_t stream_)
{
this->stream = stream_;
}
#endif
void WellContributions::addSizes(unsigned int nnz, unsigned int numEq, unsigned int numWellEq)
{
if(allocated){
std::cerr << "Error cannot add more sizes after allocated in WellContributions" << std::endl;
exit(1);
}
num_blocks += nnz;
dim = numEq;
dim_wells = numWellEq;
num_wells++;
}
// Default value
bool WellContributions::gpu_mode = false;
// If HAVE_CUDA is false, use_gpu must be false too
void WellContributions::setMode(bool use_gpu){
gpu_mode = use_gpu;
}
} //namespace Opm
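For reference, the indexing used in apply_cpu() and in the GPU kernel implies the following storage layout: B and C each hold num_blocks blocks of dim_wells x dim doubles stored row-wise (apply() reads C transposed to form C^T), D^-1 holds one dim_wells x dim_wells block per well, and val_pointers[well] gives the index of the first B/C block of that well in Bcols/Ccols. The small helpers below are illustrative only, not part of this commit; they simply spell out that layout.

// Illustrative accessors matching the index arithmetic in apply_cpu() above.
// Row r, column c of block 'block' in B (C uses the same layout).
inline double getBValue(const double *Bnnzs, unsigned int block, unsigned int r, unsigned int c,
                        unsigned int dim, unsigned int dim_wells)
{
    return Bnnzs[block * dim * dim_wells + r * dim + c];
}

// Row r, column c of the D^-1 block of well 'well'.
inline double getDValue(const double *Dnnzs, unsigned int well, unsigned int r, unsigned int c,
                        unsigned int dim_wells)
{
    return Dnnzs[well * dim_wells * dim_wells + r * dim_wells + c];
}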

View File

@ -0,0 +1,136 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef WellContributions_H
#define WellContributions_H
#include <vector>
#if HAVE_CUDA
#include <cuda_runtime.h>
#endif
#include <config.h>
namespace Opm
{
/// This class serves to eliminate the need to assemble the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
/// If the --matrix-add-well-contributions command-line parameter is true, this class should not be used
class WellContributions
{
private:
static bool gpu_mode; // gpu_mode should be initialized in the ISTLSolverEbos constructor, and its value must not change afterwards
unsigned int num_blocks = 0; // total number of blocks in all wells
unsigned int dim;
unsigned int dim_wells;
unsigned int num_wells = 0;
unsigned int num_blocks_so_far = 0;
unsigned int num_wells_so_far = 0;
unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
bool allocated = false;
#if HAVE_CUDA
double *d_Cnnzs = nullptr;
double *d_Dnnzs = nullptr;
double *d_Bnnzs = nullptr;
int *d_Ccols = nullptr;
int *d_Bcols = nullptr;
double *d_z1 = nullptr;
double *d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
cudaStream_t stream;
#endif
double *Cnnzs = nullptr;
double *Dnnzs = nullptr;
double *Bnnzs = nullptr;
int *Ccols = nullptr;
int *Dcols = nullptr;
int *Bcols = nullptr;
double *z1 = nullptr; // z1 = B * x
double *z2 = nullptr; // z2 = D^-1 * B * x
/// Apply the WellContributions on CPU
void apply_cpu(double *x, double *y);
/// Allocate memory on the CPU
void alloc_cpu();
/// Free memory on the CPU
void free_cpu();
/// Same as addMatrix(), stores matrix on CPU
void addMatrix_cpu(int idx, int *colIndices, double *values, unsigned int val_size);
#if HAVE_CUDA
/// Apply all well contributions on GPU, performs y -= C^T * (D^-1 * (B * x))
/// Kernel is asynchronous
void apply_gpu(double *d_x, double *d_y);
/// Allocate memory on the GPU
void alloc_gpu();
/// Free memory on the GPU
void free_gpu();
/// Same as addMatrix(), stores matrix on GPU
void addMatrix_gpu(int idx, int *colIndices, double *values, unsigned int val_size);
#endif
public:
#if HAVE_CUDA
/// Set a cudaStream to be used
/// \param[in] stream the cudaStream that is used to launch the kernel in
void setCudaStream(cudaStream_t stream);
#endif
/// Create a new WellContributions, implementation is empty
WellContributions(){};
/// Destroy a WellContributions, and free memory
~WellContributions();
/// Apply all well contributions in this object
void apply(double *x, double *y);
/// Allocate memory for the well contributions
void alloc_all();
/// Indicate how large the next well contributions are; this function cannot be called after alloc_all() has been called
void addSizes(unsigned int nnz, unsigned int numEq, unsigned int numWellEq);
/// Store a matrix in this object, in blocked csr format
void addMatrix(int idx, int *colIndices, double *values, unsigned int val_size);
/// Return the number of wells added to this object
unsigned int get_num_wells(){
return num_wells;
}
/// WellContributions can be applied on CPU or GPU
/// This function sets the static variable, so each WellContributions is applied on the correct hardware
static void setMode(bool use_gpu);
};
} //namespace Opm
#endif
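A minimal usage sketch of the class above, assuming a single well with two perforated cells. The call order mirrors BlackoilWellModel::getWellContributions() and cusparseSolverBackend in this commit, but the wrapper function and the data values are made up for illustration. In GPU mode x and y must be device pointers (and the solver also sets a CUDA stream before apply()); in CPU mode they are host pointers.

#include <vector>
#include <opm/simulators/linalg/bda/WellContributions.hpp>

// Hypothetical example, not part of the commit: fill and apply one well.
void exampleWellContributionsUsage(double *x, double *y, bool use_gpu)
{
    Opm::WellContributions::setMode(use_gpu);               // choose CPU or GPU before allocating
    Opm::WellContributions wellContribs;

    const unsigned int nnzB = 2, numEq = 3, numWellEq = 4;   // one well, two B/C blocks
    wellContribs.addSizes(nnzB, numEq, numWellEq);           // one call per well
    wellContribs.alloc_all();                                // allocate on the selected hardware

    std::vector<int> cols = {10, 11};                        // block-row indices of the perforated cells
    std::vector<double> Cvals(nnzB * numEq * numWellEq, 0.0);
    std::vector<double> Dvals(numWellEq * numWellEq, 0.0);
    std::vector<double> Bvals(nnzB * numEq * numWellEq, 0.0);
    int zero = 0;
    wellContribs.addMatrix(0, cols.data(), Cvals.data(), nnzB);  // C blocks
    wellContribs.addMatrix(1, &zero, Dvals.data(), 1);           // the single D^-1 block
    wellContribs.addMatrix(2, cols.data(), Bvals.data(), nnzB);  // B blocks

    // in GPU mode the solver calls wellContribs.setCudaStream(stream) here;
    // then, after every sparse matrix-vector product in BiCGSTAB:
    wellContribs.apply(x, y);                                    // y -= C^T * (D^-1 * (B * x))
}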

View File

@ -38,6 +38,10 @@
#include "cusparse_v2.h"
// For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html
// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
#define COPY_ROW_BY_ROW 0
namespace Opm
{
@ -58,7 +62,7 @@ namespace Opm
finalize();
}
void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res) {
void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
double t_total1, t_total2;
int n = N;
double rho = 1.0, rhop;
@ -72,7 +76,11 @@ namespace Opm
t_total1 = second();
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r);
if(wellContribs.get_num_wells() > 0){
wellContribs.setCudaStream(stream);
}
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
cublasDscal(cublasHandle, n, &mone, d_r, 1);
cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
@ -101,15 +109,20 @@ namespace Opm
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer);
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer);
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
// spmv
cusparseDbsrmv(cusparseHandle, order, \
operation, Nb, Nb, nnzb, \
&one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v);
&one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
// apply wellContributions
if(wellContribs.get_num_wells() > 0){
wellContribs.apply(d_pw, d_v);
}
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
alpha = rho / tmp1;
@ -127,15 +140,20 @@ namespace Opm
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer);
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer);
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
// spmv
cusparseDbsrmv(cusparseHandle, order, \
operation, Nb, Nb, nnzb, &one, descr_M, \
d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t);
d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
// apply wellContributions
if(wellContribs.get_num_wells() > 0){
wellContribs.apply(d_s, d_t);
}
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
@ -178,8 +196,8 @@ namespace Opm
void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
this->N = N;
this->nnz = nnz;
this->BLOCK_SIZE = dim;
this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE;
this->block_size = dim;
this->nnzb = nnz/block_size/block_size;
Nb = (N + dim - 1) / dim;
std::ostringstream out;
out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
@ -229,8 +247,10 @@ namespace Opm
cusparseSetStream(cusparseHandle, stream);
cudaCheckLastError("Could not set stream to cusparse");
cudaMallocHost((void**)&x, sizeof(double) * N);
cudaCheckLastError("Could not allocate pinned host memory");
#if COPY_ROW_BY_ROW
cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
cudaCheckLastError("Could not allocate pinned memory");
#endif
initialized = true;
} // end initialize()
@ -259,11 +279,10 @@ namespace Opm
cusparseDestroyMatDescr(descr_U);
cusparseDestroy(cusparseHandle);
cublasDestroy(cublasHandle);
cudaHostUnregister(vals);
cudaHostUnregister(cols);
cudaHostUnregister(rows);
#if COPY_ROW_BY_ROW
cudaFreeHost(vals_contiguous);
#endif
cudaStreamDestroy(stream);
cudaFreeHost(x);
} // end finalize()
@ -274,22 +293,23 @@ namespace Opm
t1 = second();
}
// information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
// possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault);
cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault);
cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault);
cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault);
#if COPY_ROW_BY_ROW
int sum = 0;
for(int i = 0; i < Nb; ++i){
int size_row = rows[i+1] - rows[i];
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
#endif
cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
this->vals = vals;
this->cols = cols;
this->rows = rows;
if (verbosity > 2) {
cudaStreamSynchronize(stream);
t2 = second();
@ -301,14 +321,25 @@ namespace Opm
// don't copy rowpointers and colindices, they stay the same
void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b) {
void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
double t1, t2;
if (verbosity > 2) {
t1 = second();
}
#if COPY_ROW_BY_ROW
int sum = 0;
for(int i = 0; i < Nb; ++i){
int size_row = rows[i+1] - rows[i];
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
#endif
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
@ -364,11 +395,11 @@ namespace Opm
cudaCheckLastError("Could not create analysis info");
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M);
descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L);
descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U);
descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
cudaCheckLastError();
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
@ -377,7 +408,7 @@ namespace Opm
// analysis of ilu LU decomposition
cusparseDbsrilu02_analysis(cusparseHandle, order, \
Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_M, policy, d_buffer);
block_size, info_M, policy, d_buffer);
int structural_zero;
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
@ -388,11 +419,11 @@ namespace Opm
// analysis of ilu apply
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_L, policy, d_buffer);
block_size, info_L, policy, d_buffer);
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_U, policy, d_buffer);
block_size, info_U, policy, d_buffer);
cudaCheckLastError("Could not analyse level information");
if (verbosity > 2) {
@ -403,6 +434,8 @@ namespace Opm
OpmLog::info(out.str());
}
analysis_done = true;
return true;
} // end analyse_matrix()
@ -417,7 +450,8 @@ namespace Opm
d_mRows = d_bRows;
cusparseDbsrilu02(cusparseHandle, order, \
Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
BLOCK_SIZE, info_M, policy, d_buffer);
block_size, info_M, policy, d_buffer);
cudaCheckLastError("Could not perform ilu decomposition");
int structural_zero;
// cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
@ -437,9 +471,9 @@ namespace Opm
} // end create_preconditioner()
void cusparseSolverBackend::solve_system(BdaResult &res) {
void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
// actually solve
gpu_pbicgstab(res);
gpu_pbicgstab(wellContribs, res);
cudaStreamSynchronize(stream);
cudaCheckLastError("Something went wrong during the GPU solve");
} // end solve_system()
@ -449,10 +483,6 @@ namespace Opm
// caller must be sure that x is a valid array
void cusparseSolverBackend::post_process(double *x) {
if (!initialized) {
cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault);
}
double t1, t2;
if (verbosity > 2) {
t1 = second();
@ -472,12 +502,12 @@ namespace Opm
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res) {
cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
if (initialized == false) {
initialize(N, nnz, dim);
copy_system_to_gpu(vals, rows, cols, b);
}else{
update_system_on_gpu(vals, b);
update_system_on_gpu(vals, rows, b);
}
if (analysis_done == false) {
if (!analyse_matrix()) {
@ -486,7 +516,7 @@ namespace Opm
}
reset_prec_on_gpu();
if (create_preconditioner()) {
solve_system(res);
solve_system(wellContribs, res);
}else{
return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
}

View File

@ -25,6 +25,7 @@
#include "cusparse_v2.h"
#include "opm/simulators/linalg/bda/BdaResult.hpp"
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace Opm
{
@ -50,13 +51,11 @@ private:
int *d_bRows, *d_mRows;
double *d_x, *d_b, *d_r, *d_rw, *d_p;
double *d_pw, *d_s, *d_t, *d_v;
double *vals;
int *cols, *rows;
double *x, *b;
void *d_buffer;
int N, Nb, nnz, nnzb;
double *vals_contiguous;
int BLOCK_SIZE;
int block_size;
bool initialized = false;
bool analysis_done = false;
@ -70,8 +69,9 @@ private:
int verbosity = 0;
/// Solve linear system using ilu0-bicgstab
/// \param[inout] res summary of solver result
void gpu_pbicgstab(BdaResult& res);
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
@ -83,16 +83,17 @@ private:
void finalize();
/// Copy linear system to GPU
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] b input vector, contains N values
void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);
// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true
/// \param[in] b input vector, contains N values
void update_system_on_gpu(double *vals, double *b);
void update_system_on_gpu(double *vals, int *rows, double *b);
/// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
void reset_prec_on_gpu();
@ -106,8 +107,9 @@ private:
bool create_preconditioner();
/// Solve linear system
/// \param[inout] res summary of solver result
void solve_system(BdaResult &res);
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(WellContributions& wellContribs, BdaResult &res);
public:
@ -128,16 +130,17 @@ public:
~cusparseSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] b input vector, contains N values
/// \param[inout] res summary of solver result
/// \return status code
cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res);
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] b input vector, contains N values
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res);
/// Post processing after linear solve, now only copies resulting x vector back
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array

View File

@ -192,6 +192,9 @@ namespace Opm {
// subtract B*inv(D)*C * x from A*x
void apply(const BVector& x, BVector& Ax) const;
// accumulate the contributions of all Wells in the WellContributions object
void getWellContributions(WellContributions& x) const;
// apply well model with scaling of alpha
void applyScaleAdd(const Scalar alpha, const BVector& x, BVector& Ax) const;

View File

@ -919,7 +919,25 @@ namespace Opm {
}
template<typename TypeTag>
void
BlackoilWellModel<TypeTag>::
getWellContributions(WellContributions& wellContribs) const
{
for(unsigned int i = 0; i < well_container_.size(); i++){
auto& well = well_container_[i];
std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
unsigned int nnz, numEq, numWellEq;
derived->getWellSizes(nnz, numEq, numWellEq);
wellContribs.addSizes(nnz, numEq, numWellEq);
}
wellContribs.alloc_all();
for(unsigned int i = 0; i < well_container_.size(); i++){
auto& well = well_container_[i];
std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
derived->addWellContribution(wellContribs);
}
}
// Ax = Ax - alpha * C D^-1 B x

View File

@ -184,6 +184,12 @@ namespace Opm
/// r = r - C D^-1 Rw
virtual void apply(BVector& r) const override;
/// add the contribution (C, D^-1, B matrices) of this Well to the WellContributions object
void addWellContribution(WellContributions& wellContribs) const;
/// get the sizes of the C, D^-1 and B matrices, used to allocate memory in a WellContributions object
void getWellSizes(unsigned int& _nnzs, unsigned int& _numEq, unsigned int& _numWellEq) const;
/// using the solution x to recover the solution xw for wells and applying
/// xw to update Well State
virtual void recoverWellSolutionAndUpdateWellState(const BVector& x,

View File

@ -2736,6 +2736,65 @@ namespace Opm
}
template<typename TypeTag>
void
StandardWell<TypeTag>::
addWellContribution(WellContributions& wellContribs) const
{
std::vector<int> colIndices;
std::vector<double> nnzValues;
colIndices.reserve(duneB_.nonzeroes());
nnzValues.reserve(duneB_.nonzeroes()*numStaticWellEq * numEq);
// duneC
for ( auto colC = duneC_[0].begin(), endC = duneC_[0].end(); colC != endC; ++colC )
{
colIndices.emplace_back(colC.index());
for (int i = 0; i < numStaticWellEq; ++i) {
for (int j = 0; j < numEq; ++j) {
nnzValues.emplace_back((*colC)[i][j]);
}
}
}
wellContribs.addMatrix(0, colIndices.data(), nnzValues.data(), duneC_.nonzeroes());
// invDuneD
colIndices.clear();
nnzValues.clear();
colIndices.emplace_back(0);
for (int i = 0; i < numStaticWellEq; ++i)
{
for (int j = 0; j < numStaticWellEq; ++j) {
nnzValues.emplace_back(invDuneD_[0][0][i][j]);
}
}
wellContribs.addMatrix(1, colIndices.data(), nnzValues.data(), 1);
// duneB
colIndices.clear();
nnzValues.clear();
for ( auto colB = duneB_[0].begin(), endB = duneB_[0].end(); colB != endB; ++colB )
{
colIndices.emplace_back(colB.index());
for (int i = 0; i < numStaticWellEq; ++i) {
for (int j = 0; j < numEq; ++j) {
nnzValues.emplace_back((*colB)[i][j]);
}
}
}
wellContribs.addMatrix(2, colIndices.data(), nnzValues.data(), duneB_.nonzeroes());
}
template<typename TypeTag>
void
StandardWell<TypeTag>::
getWellSizes(unsigned int& _nnzs, unsigned int& _numEq, unsigned int& _numWellEq) const
{
_nnzs = duneB_.nonzeroes();
_numEq = numEq;
_numWellEq = numStaticWellEq;
}