From 581cbc6a3e4eebf3f12820845b7eb1d58c6ea9bc Mon Sep 17 00:00:00 2001 From: "T.D. (Tongdong) Qiu" Date: Fri, 13 Mar 2020 14:21:59 +0100 Subject: [PATCH] cusparseSolver can now apply wellcontributions separately, so --matrix-add-wellcontributions=true is not required anymore --- CMakeLists_files.cmake | 1 + opm/simulators/linalg/ISTLSolverEbos.hpp | 14 +- opm/simulators/linalg/bda/BdaBridge.cpp | 9 +- opm/simulators/linalg/bda/BdaBridge.hpp | 11 +- .../linalg/bda/WellContributions.cu | 420 ++++++++++++++++++ .../linalg/bda/WellContributions.hpp | 136 ++++++ .../linalg/bda/cusparseSolverBackend.cu | 116 +++-- .../linalg/bda/cusparseSolverBackend.hpp | 45 +- opm/simulators/wells/BlackoilWellModel.hpp | 3 + .../wells/BlackoilWellModel_impl.hpp | 20 +- opm/simulators/wells/StandardWell.hpp | 6 + opm/simulators/wells/StandardWell_impl.hpp | 59 +++ 12 files changed, 763 insertions(+), 77 deletions(-) create mode 100644 opm/simulators/linalg/bda/WellContributions.cu create mode 100644 opm/simulators/linalg/bda/WellContributions.hpp diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index ac42c705a..389c72fef 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -45,6 +45,7 @@ list (APPEND MAIN_SOURCE_FILES if(CUDA_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu) endif() # originally generated with the command: diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index b3ce39320..08d1b022b 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -240,11 +240,7 @@ protected: const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu); const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter); const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction); - const bool matrix_add_well_contributions = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); const int linear_solver_verbosity = parameters_.linear_solver_verbosity_; - if (use_gpu && !matrix_add_well_contributions) { - OPM_THROW(std::logic_error,"Error cannot use GPU solver if command line parameter --matrix-add-well-contributions is false, because the GPU solver performs a standard bicgstab"); - } bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance)); #else const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu); @@ -442,14 +438,20 @@ protected: // tries to solve linear system // solve_system() does nothing if Dune is selected #if HAVE_CUDA - bdaBridge->solve_system(matrix_.get(), istlb, result); + WellContributions wellContribs; + const bool addWellContribs = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); + const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu); + if(!addWellContribs && use_gpu) + { + simulator_.problem().wellModel().getWellContributions(wellContribs); + } + bdaBridge->solve_system(matrix_.get(), istlb, wellContribs, result); if (result.converged) { // get result vector x from non-Dune backend, iff solve was successful bdaBridge->get_result(x); } else { // CPU fallback, or default case for Dune - const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu); if (use_gpu) { OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system..."); } diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index a78eb3084..3cae84b08 100644 --- 
a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -41,6 +41,7 @@ BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, doub { if (use_gpu) { backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance)); + WellContributions::setMode(use_gpu); } } #else @@ -121,7 +122,7 @@ void getSparsityPattern(BridgeMatrix& mat, std::vector &h_rows, std::vector #endif template -void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED) +void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED) { #if HAVE_CUDA @@ -169,7 +170,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus; // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour - cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), result); + cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), wellContribs, result); switch(status) { case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS: //OpmLog::info("cusparseSolver converged"); @@ -210,6 +211,7 @@ Dune::BCRSMatrix, std::allocator, std::allocator > > > \ (Dune::BCRSMatrix, std::allocator > > *mat, \ Dune::BlockVector, std::allocator > > &b, \ + WellContributions& wellContribs, \ InverseOperatorResult &res); template void BdaBridge::solve_system< \ @@ -217,6 +219,7 @@ Dune::BCRSMatrix, std::allocator, std::allocator > > > \ (Dune::BCRSMatrix, std::allocator > > *mat, \ Dune::BlockVector, std::allocator > > &b, \ + WellContributions& wellContribs, \ InverseOperatorResult &res); template void BdaBridge::solve_system< \ @@ -224,6 +227,7 @@ Dune::BCRSMatrix, std::allocator, std::allocator > > > \ (Dune::BCRSMatrix, std::allocator > > *mat, \ Dune::BlockVector, std::allocator > > &b, \ + WellContributions& wellContribs, \ InverseOperatorResult &res); template void BdaBridge::solve_system< \ @@ -231,6 +235,7 @@ Dune::BCRSMatrix, std::allocator, std::allocator > > > \ (Dune::BCRSMatrix, std::allocator > > *mat, \ Dune::BlockVector, std::allocator > > &b, \ + WellContributions& wellContribs, \ InverseOperatorResult &res); template void BdaBridge::get_result< \ diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index d1ae954e8..a0f318e6e 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -26,6 +26,8 @@ #include "dune/istl/bcrsmatrix.hh" #include +#include + #if HAVE_CUDA #include #endif @@ -55,11 +57,12 @@ public: /// Solve linear system, A*x = b - /// \param[in] mat matrix A, should be of type Dune::BCRSMatrix - /// \param[in] b vector b, should be of type Dune::BlockVector - /// \param[in] result summary of solver result + /// \param[in] mat matrix A, should be of type Dune::BCRSMatrix + /// \param[in] b vector b, should be of type Dune::BlockVector + /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] result summary of solver result template - 
void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result); + void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result); /// Get the resulting x vector /// \param[inout] x vector x, should be of type Dune::BlockVector diff --git a/opm/simulators/linalg/bda/WellContributions.cu b/opm/simulators/linalg/bda/WellContributions.cu new file mode 100644 index 000000000..355860952 --- /dev/null +++ b/opm/simulators/linalg/bda/WellContributions.cu @@ -0,0 +1,420 @@ +/* + Copyright 2019 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + + +#include +#include +#include // CMake + +#ifdef __NVCC__ +#include "opm/simulators/linalg/bda/cuda_header.hpp" +#include +#endif + +#include "opm/simulators/linalg/bda/WellContributions.hpp" + +namespace Opm +{ + + // apply WellContributions using y -= C^T * (D^-1 * (B * x)) +#if HAVE_CUDA + __global__ void apply_well_contributions( \ + const double * __restrict__ Cnnzs, \ + const double * __restrict__ Dnnzs, \ + const double * __restrict__ Bnnzs, \ + const int * __restrict__ Ccols, \ + const int * __restrict__ Bcols, \ + const double * __restrict__ x, \ + double * __restrict__ y, \ + const int dim, \ + const int dim_wells, \ + const unsigned int * __restrict__ val_pointers \ + ) + { + const int idx_b = blockIdx.x; + const int idx_t = threadIdx.x; + int idx = idx_b * blockDim.x + idx_t; + const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b]; + + const int vals_per_block = dim * dim_wells; // 12 + const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24 + const int num_blocks_per_warp = 32/vals_per_block; // 2 + const int lane = idx_t % 32; + const int c = lane % dim; // col in block + const int r = (lane / dim) % dim_wells; // row in block + const int NUM_THREADS = gridDim.x * blockDim.x; + + extern __shared__ double smem[]; + double * __restrict__ z1 = smem; + double * __restrict__ z2 = z1 + dim_wells; + + if(idx_t < dim_wells){ + z1[idx_t] = 0.0; + } + + __syncthreads(); + + // z1 = B * x + if(idx_t < num_active_threads){ + // multiply all blocks with x + double temp = 0.0; + int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks + while(b < val_size + val_pointers[idx_b]){ + int colIdx = Bcols[b]; + temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c]; + b += num_blocks_per_warp; + } + + // merge all blocks into 1 dim*dim_wells block + // since NORNE has only 2 parallel blocks, do not use a loop + temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells); + + b = idx_t / vals_per_block + val_pointers[idx_b]; + + // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD + if(idx_t < vals_per_block){ + // should be a loop as well, now only works for dim == 3 + if(c == 0 || c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 
2);} // add col 2 to col 0 + if(c == 0 || c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0 + } + + // write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether + if(c == 0 && idx_t < vals_per_block){ + z1[r] = temp; + } + } + + __syncthreads(); + + // z2 = D^-1 * B * x = D^-1 * z1 + if(idx_t < dim_wells){ + double temp = 0.0; + for(int c = 0; c < dim_wells; ++c){ + temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c]; + } + z2[idx_t] = temp; + } + + __syncthreads(); + + // y -= C^T * D^-1 * B * x + // use dim * val_size threads, each block is assigned 'dim' threads + if(idx_t < dim * val_size){ + double temp = 0.0; + int b = idx_t / dim + val_pointers[idx_b]; + int cc = idx_t % dim; + int colIdx = Ccols[b]; + for(unsigned int c = 0; c < dim_wells; ++c){ + temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c]; + } + y[colIdx * dim + cc] -= temp; + } + + } +#endif + + void WellContributions::alloc_all(){ +#if HAVE_CUDA + if(gpu_mode){ + alloc_gpu(); + }else{ + alloc_cpu(); + } +#else + alloc_cpu(); +#endif + allocated = true; + } + + void WellContributions::alloc_cpu(){ + Cnnzs = new double[num_blocks * dim * dim_wells]; + Dnnzs = new double[num_wells * dim_wells * dim_wells]; + Bnnzs = new double[num_blocks * dim * dim_wells]; + Ccols = new int[num_blocks]; + Bcols = new int[num_blocks]; + val_pointers = new unsigned int[num_wells + 1]; + z1 = new double[dim_wells]; // B * x + z2 = new double[dim_wells]; // D^-1 * B * x + } + +#if HAVE_CUDA + void WellContributions::alloc_gpu(){ + cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells); + cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_wells * dim_wells * dim_wells); + cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells); + cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks); + cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks); + val_pointers = new unsigned int[num_wells + 1]; + cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_wells + 1)); + cudaCheckLastError("apply_gpu malloc failed"); + } +#endif + + WellContributions::~WellContributions() + { +#if HAVE_CUDA + if(gpu_mode){ + free_gpu(); + }else{ + free_cpu(); + } +#else + free_cpu(); +#endif + } + + void WellContributions::free_cpu(){ + delete[] Cnnzs; + delete[] Dnnzs; + delete[] Bnnzs; + delete[] Ccols; + delete[] Bcols; + delete[] val_pointers; + delete[] z1; + delete[] z2; + //delete[] Mnnzs; + } + +#if HAVE_CUDA + void WellContributions::free_gpu(){ + cudaFree(d_Cnnzs); + cudaFree(d_Dnnzs); + cudaFree(d_Bnnzs); + cudaFree(d_Ccols); + cudaFree(d_Bcols); + delete[] val_pointers; + cudaFree(d_val_pointers); + // cudaFree(d_z1); + // cudaFree(d_z2); + } +#endif + + + void WellContributions::apply(double *x, double *y){ +#if HAVE_CUDA + if (gpu_mode){ + apply_gpu(x, y); + }else{ + apply_cpu(x, y); + } +#else + apply_cpu(x, y); +#endif + } + + // Apply the WellContributions, similar to StandardWell::apply() + // y -= (C^T *(D^-1*( B*x))) + void WellContributions::apply_cpu(double *x, double *y) + { +#if 0 + // Mnnzs contains a sparse matrix with a symmetric pattern + // Mrows would contain 'i*val_size' for every entry i, since every row has the same number of blocks + // Mcols are the same as Ccols, normally, there is an entry for every block, but since all rows have the same sparsity pattern, we only have to store 1 row + bool dbg = false; + for(int i = 0; i < dim*dim*val_size*val_size; ++i){ + if(dbg)printf("Mnnzs[%d]: %.5e\n", i, 
Mnnzs[i]); + } + if(dbg)printf("row_size: %u, val_size: %u\n", row_size, val_size); + for(int r = 0; r < val_size; ++r){ + for(int c = 0; c < val_size; ++c){ + int colIdx = Ccols[c]; + if(dbg)printf("colIdx: %d\n", colIdx); + for(int i = 0; i < dim; ++i){ + double sum = 0.0; + for(int j = 0; j < dim; ++j){ + sum += Mnnzs[r * dim * dim * val_size + c * dim * dim + i * dim + j] * x[colIdx * dim + j]; + } + if(dbg)printf("sum: %f\n", sum); + y[colIdx * dim + i] -= sum; + } + } + } + if(dbg)exit(0); +#else + for(int wellID = 0; wellID < num_wells; ++wellID){ + unsigned int val_size = val_pointers[wellID+1] - val_pointers[wellID]; + + // B * x + for (unsigned int i = 0; i < dim_wells; ++i) { + z1[i] = 0.0; + } + for (unsigned int i = 0; i < val_size; ++i) { + unsigned int blockID = i + val_pointers[wellID]; + int colIdx = Bcols[blockID]; + for (unsigned int j = 0; j < dim_wells; ++j) { + double temp = 0.0; + for (unsigned int k = 0; k < dim; ++k) { + temp += Bnnzs[blockID * dim * dim_wells + j * dim + k] * x[colIdx * dim + k]; + } + z1[j] += temp; + } + } + + // D^-1 * B * x + for (unsigned int i = 0; i < dim_wells; ++i) { + z2[i] = 0.0; + } + for (unsigned int j = 0; j < dim_wells; ++j) { + double temp = 0.0; + for (unsigned int k = 0; k < dim_wells; ++k) { + temp += Dnnzs[wellID * dim_wells * dim_wells + j * dim_wells + k] * z1[k]; + } + z2[j] += temp; + } + + // C^T * D^-1 * B * x + for (unsigned int i = 0; i < val_size; ++i) { + unsigned int blockID = i + val_pointers[wellID]; + int colIdx = Ccols[blockID]; + for (unsigned int j = 0; j < dim; ++j) { + double temp = 0.0; + for (unsigned int k = 0; k < dim_wells; ++k) { + temp += Cnnzs[blockID * dim * dim_wells + j + k * dim] * z2[k]; + } + y[colIdx * dim + j] -= temp; + } + } + } +#endif + } + + + // Apply the WellContributions, similar to StandardWell::apply() + // y -= (C^T *(D^-1*( B*x))) +#if HAVE_CUDA + void WellContributions::apply_gpu(double *d_x, double *d_y) + { + int smem_size = 2 * sizeof(double) * dim_wells; + apply_well_contributions<<>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers); + } +#endif + + void WellContributions::addMatrix(int idx, int *colIndices, double *values, unsigned int val_size) + { +#if HAVE_CUDA + if(gpu_mode){ + addMatrix_gpu(idx, colIndices, values, val_size); + }else{ + addMatrix_cpu(idx, colIndices, values, val_size); + } +#else + addMatrix_cpu(idx, colIndices, values, val_size); +#endif + if(idx == 2){ + num_blocks_so_far += val_size; + } + if(idx == 2){ + num_wells_so_far++; + } + } + + + void WellContributions::addMatrix_cpu(int idx, int *colIndices, double *values, unsigned int val_size) + { + switch (idx) { + case 0: + memcpy(Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells); + memcpy(Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size); + break; + case 1: + memcpy(Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells); + break; + case 2: + memcpy(Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells); + memcpy(Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size); + val_pointers[num_wells_so_far] = num_blocks_so_far; + if(num_wells_so_far == num_wells - 1){ + val_pointers[num_wells] = num_blocks; + } + break; + case 3: + // store (C*D*B) + printf("ERROR unsupported matrix ID for WellContributions::addMatrix()\n"); + exit(1); + // memcpy(Mnnzs, values, sizeof(double) * dim * dim * val_size * val_size); + // 
memcpy(Ccols, colIndices, sizeof(int) * val_size); + break; + default: + printf("ERROR unknown matrix ID for WellContributions::addMatrix()\n"); + exit(1); + } + } + + +#if HAVE_CUDA + void WellContributions::addMatrix_gpu(int idx, int *colIndices, double *values, unsigned int val_size) + { + switch (idx) { + case 0: + cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice); + cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice); + break; + case 1: + cudaMemcpy(d_Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice); + break; + case 2: + cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice); + cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice); + val_pointers[num_wells_so_far] = num_blocks_so_far; + if(num_wells_so_far == num_wells - 1){ + val_pointers[num_wells] = num_blocks; + } + cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_wells+1), cudaMemcpyHostToDevice); + break; + case 3: + // store (C*D*B) + printf("ERROR unsupported matrix ID for WellContributions::addMatrix()\n"); + exit(1); + break; + default: + printf("ERROR unknown matrix ID for WellContributions::addMatrix()\n"); + exit(1); + } + cudaCheckLastError("WellContributions::addMatrix() failed"); + } + + void WellContributions::setCudaStream(cudaStream_t stream_) + { + this->stream = stream_; + } + +#endif + + void WellContributions::addSizes(unsigned int nnz, unsigned int numEq, unsigned int numWellEq) + { + if(allocated){ + std::cerr << "Error cannot add more sizes after allocated in WellContributions" << std::endl; + exit(1); + } + num_blocks += nnz; + dim = numEq; + dim_wells = numWellEq; + num_wells++; + } + + // Default value + bool WellContributions::gpu_mode = false; + + // If HAVE_CUDA is false, use_gpu must be false too + void WellContributions::setMode(bool use_gpu){ + gpu_mode = use_gpu; + } + +} //namespace Opm + diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp new file mode 100644 index 000000000..07773971b --- /dev/null +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -0,0 +1,136 @@ +/* + Copyright 2019 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . 
+*/ + +#ifndef WellContributions_H +#define WellContributions_H + +#include + +#if HAVE_CUDA +#include +#endif + +#include + +namespace Opm +{ + + /// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver + /// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used + class WellContributions + { + + private: + static bool gpu_mode; // gpu_mode should be initialized in the ISTLSolverEbos constructor, and its value must not change afterwards + unsigned int num_blocks = 0; // total number of blocks in all wells + unsigned int dim; + unsigned int dim_wells; + unsigned int num_wells = 0; + unsigned int num_blocks_so_far = 0; + unsigned int num_wells_so_far = 0; + unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols + bool allocated = false; + +#if HAVE_CUDA + double *d_Cnnzs = nullptr; + double *d_Dnnzs = nullptr; + double *d_Bnnzs = nullptr; + int *d_Ccols = nullptr; + int *d_Bcols = nullptr; + double *d_z1 = nullptr; + double *d_z2 = nullptr; + unsigned int *d_val_pointers = nullptr; + cudaStream_t stream; +#endif + + double *Cnnzs = nullptr; + double *Dnnzs = nullptr; + double *Bnnzs = nullptr; + int *Ccols = nullptr; + int *Dcols = nullptr; + int *Bcols = nullptr; + double *z1 = nullptr; // z1 = B * x + double *z2 = nullptr; // z2 = D^-1 * B * x + + /// Apply the WellContributions on CPU + void apply_cpu(double *x, double *y); + + /// Allocate memory on the CPU + void alloc_cpu(); + + /// Free memory on the CPU + void free_cpu(); + + /// Same as addMatrix(), stores matrix on CPU + void addMatrix_cpu(int idx, int *colIndices, double *values, unsigned int val_size); + +#if HAVE_CUDA + /// Apply all wellcontributions on GPU, performs y -= C^T * (D^-1 * (B * x)) + /// Kernel is asynchronous + void apply_gpu(double *d_x, double *d_y); + + /// Allocate memory on the GPU + void alloc_gpu(); + + /// Free memory on the GPU + void free_gpu(); + + /// Same as addMatrix(), stores matrix on GPU + void addMatrix_gpu(int idx, int *colIndices, double *values, unsigned int val_size); +#endif + + public: +#if HAVE_CUDA + /// Set a cudaStream to be used + /// \param[in] stream the cudaStream that is used to launch the kernel in + void setCudaStream(cudaStream_t stream); +#endif + + /// Create a new WellContributions, implementation is empty + WellContributions(){}; + + /// Destroy a WellContributions, and free memory + ~WellContributions(); + + /// Apply all wellcontributions in this object + void apply(double *x, double *y); + + /// Allocate memory for the wellcontributions + void alloc_all(); + + /// Indicate how large the next wellcontributions are, this function cannot be called after alloc_all() is called + void addSizes(unsigned int nnz, unsigned int numEq, unsigned int numWellEq); + + /// Store a matrix in this object, in blocked csr format + void addMatrix(int idx, int *colIndices, double *values, unsigned int val_size); + + /// Return the number of wells added to this object + unsigned int get_num_wells(){ + return num_wells; + } + + /// WellContributions can be applied on CPU or GPU + /// This function sets the static variable, so each WellContributions is applied on the correct hardware + static void setMode(bool use_gpu); + + }; + +} //namespace Opm + +#endif diff --git a/opm/simulators/linalg/bda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cusparseSolverBackend.cu index 
547840784..9425d1136 100644 --- a/opm/simulators/linalg/bda/cusparseSolverBackend.cu +++ b/opm/simulators/linalg/bda/cusparseSolverBackend.cu @@ -38,6 +38,10 @@ #include "cusparse_v2.h" // For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html +// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done +// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough +#define COPY_ROW_BY_ROW 0 + namespace Opm { @@ -58,7 +62,7 @@ namespace Opm finalize(); } - void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res) { + void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { double t_total1, t_total2; int n = N; double rho = 1.0, rhop; @@ -72,7 +76,11 @@ namespace Opm t_total1 = second(); - cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r); + if(wellContribs.get_num_wells() > 0){ + wellContribs.setCudaStream(stream); + } + + cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r); cublasDscal(cublasHandle, n, &mone, d_r, 1); cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1); @@ -101,15 +109,20 @@ namespace Opm // apply ilu0 cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ - descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer); + descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer); cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ - descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer); + descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer); // spmv cusparseDbsrmv(cusparseHandle, order, \ operation, Nb, Nb, nnzb, \ - &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v); + &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v); + + // apply wellContributions + if(wellContribs.get_num_wells() > 0){ + wellContribs.apply(d_pw, d_v); + } cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1); alpha = rho / tmp1; @@ -127,15 +140,20 @@ namespace Opm // apply ilu0 cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ - descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer); + descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer); cusparseDbsrsv2_solve(cusparseHandle, order, \ operation, Nb, nnzb, &one, \ - descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer); + descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer); // spmv cusparseDbsrmv(cusparseHandle, order, \ operation, Nb, Nb, nnzb, &one, descr_M, \ - d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t); + d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t); + + // apply wellContributions + if(wellContribs.get_num_wells() > 0){ + wellContribs.apply(d_s, d_t); + } cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1); cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2); @@ -178,8 +196,8 @@ namespace Opm void cusparseSolverBackend::initialize(int N, int nnz, int dim) { this->N = N; this->nnz = nnz; - this->BLOCK_SIZE = dim; - this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE; + this->block_size = dim; + this->nnzb = nnz/block_size/block_size; Nb = 
(N + dim - 1) / dim; std::ostringstream out; out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks"; @@ -229,8 +247,10 @@ namespace Opm cusparseSetStream(cusparseHandle, stream); cudaCheckLastError("Could not set stream to cusparse"); - cudaMallocHost((void**)&x, sizeof(double) * N); - cudaCheckLastError("Could not allocate pinned host memory"); +#if COPY_ROW_BY_ROW + cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz); + cudaCheckLastError("Could not allocate pinned memory"); +#endif initialized = true; } // end initialize() @@ -259,11 +279,10 @@ namespace Opm cusparseDestroyMatDescr(descr_U); cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); - cudaHostUnregister(vals); - cudaHostUnregister(cols); - cudaHostUnregister(rows); +#if COPY_ROW_BY_ROW + cudaFreeHost(vals_contiguous); +#endif cudaStreamDestroy(stream); - cudaFreeHost(x); } // end finalize() @@ -274,22 +293,23 @@ namespace Opm t1 = second(); } - // information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c - // possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory - cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault); - cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault); - cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault); - cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault); +#if COPY_ROW_BY_ROW + int sum = 0; + for(int i = 0; i < Nb; ++i){ + int size_row = rows[i+1] - rows[i]; + memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size); + sum += size_row * block_size * block_size; + } + cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); +#else cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); +#endif + cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream); cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream); - this->vals = vals; - this->cols = cols; - this->rows = rows; - if (verbosity > 2) { cudaStreamSynchronize(stream); t2 = second(); @@ -301,14 +321,25 @@ namespace Opm // don't copy rowpointers and colindices, they stay the same - void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b) { + void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) { double t1, t2; if (verbosity > 2) { t1 = second(); } +#if COPY_ROW_BY_ROW + int sum = 0; + for(int i = 0; i < Nb; ++i){ + int size_row = rows[i+1] - rows[i]; + memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size); + sum += size_row * block_size * block_size; + } + cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); +#else cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); +#endif + cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream); cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream); @@ -364,11 +395,11 @@ namespace Opm cudaCheckLastError("Could not create analysis info"); cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb, - descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, 
info_M, &d_bufferSize_M); + descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M); cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb, - descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L); + descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L); cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb, - descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U); + descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U); cudaCheckLastError(); d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U)); @@ -377,7 +408,7 @@ namespace Opm // analysis of ilu LU decomposition cusparseDbsrilu02_analysis(cusparseHandle, order, \ Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \ - BLOCK_SIZE, info_M, policy, d_buffer); + block_size, info_M, policy, d_buffer); int structural_zero; cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero); @@ -388,11 +419,11 @@ namespace Opm // analysis of ilu apply cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \ Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \ - BLOCK_SIZE, info_L, policy, d_buffer); + block_size, info_L, policy, d_buffer); cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \ Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \ - BLOCK_SIZE, info_U, policy, d_buffer); + block_size, info_U, policy, d_buffer); cudaCheckLastError("Could not analyse level information"); if (verbosity > 2) { @@ -403,6 +434,8 @@ namespace Opm OpmLog::info(out.str()); } + analysis_done = true; + return true; } // end analyse_matrix() @@ -417,7 +450,8 @@ namespace Opm d_mRows = d_bRows; cusparseDbsrilu02(cusparseHandle, order, \ Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \ - BLOCK_SIZE, info_M, policy, d_buffer); + block_size, info_M, policy, d_buffer); + cudaCheckLastError("Could not perform ilu decomposition"); int structural_zero; // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize() @@ -437,9 +471,9 @@ namespace Opm } // end create_preconditioner() - void cusparseSolverBackend::solve_system(BdaResult &res) { + void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) { // actually solve - gpu_pbicgstab(res); + gpu_pbicgstab(wellContribs, res); cudaStreamSynchronize(stream); cudaCheckLastError("Something went wrong during the GPU solve"); } // end solve_system() @@ -449,10 +483,6 @@ namespace Opm // caller must be sure that x is a valid array void cusparseSolverBackend::post_process(double *x) { - if (!initialized) { - cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault); - } - double t1, t2; if (verbosity > 2) { t1 = second(); @@ -472,12 +502,12 @@ namespace Opm typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus; - cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res) { + cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { if (initialized == false) { initialize(N, nnz, dim); copy_system_to_gpu(vals, rows, cols, b); }else{ - update_system_on_gpu(vals, b); + update_system_on_gpu(vals, rows, b); } if (analysis_done == false) { if (!analyse_matrix()) { @@ -486,7 +516,7 @@ namespace Opm } reset_prec_on_gpu(); if (create_preconditioner()) { - solve_system(res); + solve_system(wellContribs, 
res); }else{ return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED; } diff --git a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp index e8d88ca29..b8648ef15 100644 --- a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp +++ b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp @@ -25,6 +25,7 @@ #include "cusparse_v2.h" #include "opm/simulators/linalg/bda/BdaResult.hpp" +#include namespace Opm { @@ -50,13 +51,11 @@ private: int *d_bRows, *d_mRows; double *d_x, *d_b, *d_r, *d_rw, *d_p; double *d_pw, *d_s, *d_t, *d_v; - double *vals; - int *cols, *rows; - double *x, *b; void *d_buffer; int N, Nb, nnz, nnzb; + double *vals_contiguous; - int BLOCK_SIZE; + int block_size; bool initialized = false; bool analysis_done = false; @@ -70,8 +69,9 @@ private: int verbosity = 0; /// Solve linear system using ilu0-bicgstab - /// \param[inout] res summary of solver result - void gpu_pbicgstab(BdaResult& res); + /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] res summary of solver result + void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res); /// Initialize GPU and allocate memory /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks @@ -83,16 +83,17 @@ private: void finalize(); /// Copy linear system to GPU - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values + /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values /// \param[in] rows array of rowPointers, contains N/dim+1 values /// \param[in] cols array of columnIndices, contains nnz values /// \param[in] b input vector, contains N values void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b); // Update linear system on GPU, don't copy rowpointers and colindices, they stay the same - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values + /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values + /// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true /// \param[in] b input vector, contains N values - void update_system_on_gpu(double *vals, double *b); + void update_system_on_gpu(double *vals, int *rows, double *b); /// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse void reset_prec_on_gpu(); @@ -106,8 +107,9 @@ private: bool create_preconditioner(); /// Solve linear system - /// \param[inout] res summary of solver result - void solve_system(BdaResult &res); + /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] res summary of solver result + void solve_system(WellContributions& wellContribs, BdaResult &res); public: @@ -128,16 +130,17 @@ public: ~cusparseSolverBackend(); /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format - /// \param[in] N number of rows, divide by dim to get number of blockrows - /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] dim size of block - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values - /// \param[in] rows array of rowPointers, contains N/dim+1 values - /// \param[in] cols array of columnIndices, contains nnz values - /// \param[in] b 
input vector, contains N values
-    /// \param[inout] res         summary of solver result
-    /// \return                   status code
-    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res);
+    /// \param[in] N              number of rows, divide by dim to get number of blockrows
+    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] dim            size of block
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    /// \param[in] cols           array of columnIndices, contains nnz values
+    /// \param[in] b              input vector, contains N values
+    /// \param[in] wellContribs   contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res         summary of solver result
+    /// \return                   status code
+    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res);
 
     /// Post processing after linear solve, now only copies resulting x vector back
     /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
diff --git a/opm/simulators/wells/BlackoilWellModel.hpp b/opm/simulators/wells/BlackoilWellModel.hpp
index e559aba5d..a5c9e817d 100644
--- a/opm/simulators/wells/BlackoilWellModel.hpp
+++ b/opm/simulators/wells/BlackoilWellModel.hpp
@@ -192,6 +192,9 @@ namespace Opm {
             // subtract B*inv(D)*C * x from A*x
             void apply(const BVector& x, BVector& Ax) const;
 
+            // accumulate the contributions of all Wells in the WellContributions object
+            void getWellContributions(WellContributions& wellContribs) const;
+
             // apply well model with scaling of alpha
             void applyScaleAdd(const Scalar alpha, const BVector& x, BVector& Ax) const;
 
diff --git a/opm/simulators/wells/BlackoilWellModel_impl.hpp b/opm/simulators/wells/BlackoilWellModel_impl.hpp
index ad47b484f..8e7ccdacc 100644
--- a/opm/simulators/wells/BlackoilWellModel_impl.hpp
+++ b/opm/simulators/wells/BlackoilWellModel_impl.hpp
@@ -919,7 +919,25 @@ namespace Opm {
     }
 
-
+    template<typename TypeTag>
+    void
+    BlackoilWellModel<TypeTag>::
+    getWellContributions(WellContributions& wellContribs) const
+    {
+        for(unsigned int i = 0; i < well_container_.size(); i++){
+            auto& well = well_container_[i];
+            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            unsigned int nnz, numEq, numWellEq;
+            derived->getWellSizes(nnz, numEq, numWellEq);
+            wellContribs.addSizes(nnz, numEq, numWellEq);
+        }
+        wellContribs.alloc_all();
+        for(unsigned int i = 0; i < well_container_.size(); i++){
+            auto& well = well_container_[i];
+            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            derived->addWellContribution(wellContribs);
+        }
+    }
 
     // Ax = Ax - alpha * C D^-1 B x
diff --git a/opm/simulators/wells/StandardWell.hpp b/opm/simulators/wells/StandardWell.hpp
index 69a8a9629..5789fd7cd 100644
--- a/opm/simulators/wells/StandardWell.hpp
+++ b/opm/simulators/wells/StandardWell.hpp
@@ -184,6 +184,12 @@ namespace Opm
         /// r = r - C D^-1 Rw
         virtual void apply(BVector& r) const override;
 
+        /// add the contribution (C, D^-1, B matrices) of this Well to the WellContributions object
+        void addWellContribution(WellContributions& wellContribs) const;
+
+        /// get the sizes of the C, D^-1 and B matrices, used to allocate memory in a WellContributions object
+        void getWellSizes(unsigned int& _nnzs, unsigned int& _numEq, unsigned int& _numWellEq) const;
+
         /// using the solution x to recover the solution xw for wells and applying
         /// xw to update Well State
         virtual void recoverWellSolutionAndUpdateWellState(const BVector& x,
diff --git a/opm/simulators/wells/StandardWell_impl.hpp b/opm/simulators/wells/StandardWell_impl.hpp
index c9bfabd34..7144d6429 100644
--- a/opm/simulators/wells/StandardWell_impl.hpp
+++ b/opm/simulators/wells/StandardWell_impl.hpp
@@ -2736,6 +2736,65 @@ namespace Opm
     }
 
 
+    template<typename TypeTag>
+    void
+    StandardWell<TypeTag>::
+    addWellContribution(WellContributions& wellContribs) const
+    {
+        std::vector<int> colIndices;
+        std::vector<double> nnzValues;
+        colIndices.reserve(duneB_.nonzeroes());
+        nnzValues.reserve(duneB_.nonzeroes() * numStaticWellEq * numEq);
+
+        // duneC
+        for ( auto colC = duneC_[0].begin(), endC = duneC_[0].end(); colC != endC; ++colC )
+        {
+            colIndices.emplace_back(colC.index());
+            for (int i = 0; i < numStaticWellEq; ++i) {
+                for (int j = 0; j < numEq; ++j) {
+                    nnzValues.emplace_back((*colC)[i][j]);
+                }
+            }
+        }
+        wellContribs.addMatrix(0, colIndices.data(), nnzValues.data(), duneC_.nonzeroes());
+
+        // invDuneD
+        colIndices.clear();
+        nnzValues.clear();
+        colIndices.emplace_back(0);
+        for (int i = 0; i < numStaticWellEq; ++i)
+        {
+            for (int j = 0; j < numStaticWellEq; ++j) {
+                nnzValues.emplace_back(invDuneD_[0][0][i][j]);
+            }
+        }
+        wellContribs.addMatrix(1, colIndices.data(), nnzValues.data(), 1);
+
+        // duneB
+        colIndices.clear();
+        nnzValues.clear();
+        for ( auto colB = duneB_[0].begin(), endB = duneB_[0].end(); colB != endB; ++colB )
+        {
+            colIndices.emplace_back(colB.index());
+            for (int i = 0; i < numStaticWellEq; ++i) {
+                for (int j = 0; j < numEq; ++j) {
+                    nnzValues.emplace_back((*colB)[i][j]);
+                }
+            }
+        }
+        wellContribs.addMatrix(2, colIndices.data(), nnzValues.data(), duneB_.nonzeroes());
+    }
+
+
+    template<typename TypeTag>
+    void
+    StandardWell<TypeTag>::
+    getWellSizes(unsigned int& _nnzs, unsigned int& _numEq, unsigned int& _numWellEq) const
+    {
+        _nnzs = duneB_.nonzeroes();
+        _numEq = numEq;
+        _numWellEq = numStaticWellEq;
+    }
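
Notes on the y -= C^T * (D^-1 * (B * x)) operation implemented by this patch. This is the standard matrix-free application of the well Schur complement (the sign conventions below are inferred from StandardWell::apply() and the kernel comment, not stated explicitly in the patch): eliminating the well unknowns x_w from the coupled reservoir/well system gives

\[
\begin{pmatrix} A & C^{\mathsf T} \\ B & D \end{pmatrix}
\begin{pmatrix} x \\ x_w \end{pmatrix}
=
\begin{pmatrix} b \\ b_w \end{pmatrix}
\quad\Longrightarrow\quad
\bigl(A - C^{\mathsf T} D^{-1} B\bigr)\, x \;=\; b - C^{\mathsf T} D^{-1} b_w .
\]

Each BiCGSTAB iteration therefore computes y = A*x with cusparseDbsrmv and then corrects y -= C^T * (D^-1 * (B * x)) via WellContributions::apply(), instead of assembling A - C^T * D^-1 * B explicitly, which is what --matrix-add-well-contributions=true used to require.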
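
On the CUDA kernel's magic constants (this decoding is inferred from the code, not documented in the patch): the masks are tied to the NORNE block shape dim = 3, dim_wells = 4, so vals_per_block = 12, two blocks fit in a 32-lane warp, and 24 lanes are active. 0x00ffffff selects lanes 0-23 for the shfl_down by 12 that folds the second block onto the first. Within the remaining 12 lanes, 0x00000B6D (binary 101101101101) is exactly the set of lanes with column index c in {0, 2}, and 0x000006DB (binary 011011011011) the lanes with c in {0, 1}, so the two shuffles accumulate column 2 and then column 1 into column 0 of each row. Other dim values would need different masks, which is why the in-code comment flags the dim == 3 restriction.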
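
The fill-and-apply protocol of WellContributions is spread over BlackoilWellModel::getWellContributions() and StandardWell::addWellContribution() above. A minimal stand-alone sketch of that protocol on the CPU path (a hypothetical driver, not part of the patch; one well, numEq = 3, numWellEq = 2, a single perforation, and D passed already inverted, since the class stores D^-1):

    #include <cstdio>
    #include <opm/simulators/linalg/bda/WellContributions.hpp>

    int main()
    {
        Opm::WellContributions::setMode(false);   // select the CPU path before any object is used

        Opm::WellContributions wc;
        wc.addSizes(1, 3, 2);                     // one call per well: #blocks, numEq, numWellEq
        wc.alloc_all();                           // no addSizes() calls allowed after this

        // Dense blocks, row-major: B and C are numWellEq x numEq, D^-1 is numWellEq x numWellEq.
        double C[6]    = {1, 0, 0,  0, 1, 0};
        double Dinv[4] = {2, 0,  0, 2};
        double B[6]    = {1, 1, 0,  0, 0, 1};
        int col = 0;                              // cell (column) index of the perforation

        wc.addMatrix(0, &col, C, 1);              // idx 0: C blocks
        wc.addMatrix(1, &col, Dinv, 1);           // idx 1: D^-1, one block per well
        wc.addMatrix(2, &col, B, 1);              // idx 2: B blocks, must come last (advances counters)

        double x[3] = {1, 2, 3};
        double y[3] = {0, 0, 0};
        wc.apply(x, y);                           // y -= C^T * (D^-1 * (B * x))

        // B*x = (3, 3); D^-1*(B*x) = (6, 6); C^T*(...) = (6, 6, 0); hence y = (-6, -6, 0).
        std::printf("%g %g %g\n", y[0], y[1], y[2]);
        return 0;
    }

Note the ordering constraint baked into addMatrix(): only idx == 2 advances num_blocks_so_far and num_wells_so_far, so C and D^-1 must be stored before B for each well, exactly the order addWellContribution() uses.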
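
On the GPU path the same object launches apply_well_contributions instead. A hypothetical fragment (assumes a CUDA build with HAVE_CUDA, setMode(true) called before alloc_all(), an object wc filled as above but with the block shape numEq = 3, numWellEq = 4, the only shape the kernel's shuffle masks currently support, and host vectors x, y of length N):

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    double *d_x = nullptr, *d_y = nullptr;        // device copies of x and y
    cudaMalloc((void**)&d_x, sizeof(double) * N);
    cudaMalloc((void**)&d_y, sizeof(double) * N);
    cudaMemcpy(d_x, x, sizeof(double) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, sizeof(double) * N, cudaMemcpyHostToDevice);

    wc.setCudaStream(stream);                     // same stream the solver kernels run on
    wc.apply(d_x, d_y);                           // asynchronous kernel launch
    cudaStreamSynchronize(stream);                // wait before reading d_y back

    cudaMemcpy(y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost);

This mirrors what cusparseSolverBackend::gpu_pbicgstab() does around its cusparseDbsrmv calls: set the stream once, then apply the contributions after each sparse matrix-vector product.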