Mirror of https://github.com/OPM/opm-simulators.git
Added StandardWell functionality to OpenCL backend
commit 56c1c0862c
parent 4b45623333
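
At a glance, this commit (1) gives WellContributions a gpu_mode-aware constructor and OpenCL host/device buffers, (2) adds an add_well_contributions OpenCL kernel, and (3) threads the object through openclSolverBackend's initialize/copy/solve path. A condensed usage sketch assembled from the hunks below (illustration only, not code from the commit; context, queue and kernel stand for the backend's cl::Context, cl::CommandQueue and the add_well_contributions_k functor, and dim/dim_wells/val_size are placeholders):

    // C++ sketch of the new WellContributions lifecycle on the OpenCL path
    Opm::WellContributions wellContribs("opencl");   // "cusparse" would set cuda_gpu instead
    wellContribs.setBlockSize(dim, dim_wells);
    wellContribs.addNumBlocks(numBlocksOfWell);      // once per standard well, before alloc()
    wellContribs.alloc();                            // allocates the h_*_ocl host arrays
    wellContribs.addMatrix(Opm::WellContributions::MatrixType::C, colsC, valsC, val_size);
    wellContribs.addMatrix(Opm::WellContributions::MatrixType::D, nullptr, valsD, val_size); // colIndices ignored for D
    wellContribs.addMatrix(Opm::WellContributions::MatrixType::B, colsB, valsB, val_size);

    wellContribs.init(context);                      // creates the d_*_ocl cl::Buffers
    wellContribs.copyDataToGPU(queue);               // blocking host -> device writes
    wellContribs.apply(queue, d_x, d_y, kernel);     // y -= C^T * (D^-1 * (B * x))
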
@@ -181,7 +181,7 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
 #else
             const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
             if (gpu_mode.compare("none") != 0) {
-                OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL was not found by cmake");
+                OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL were found by cmake");
             }
 #endif
             extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
@@ -465,7 +465,8 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
 #if HAVE_CUDA || HAVE_OPENCL
             bool use_gpu = bdaBridge->getUseGpu();
             if (use_gpu) {
-                WellContributions wellContribs;
+                const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+                WellContributions wellContribs(gpu_mode);
                 if (!useWellConn_) {
                     simulator_.problem().wellModel().getWellContributions(wellContribs);
                 }
@@ -478,7 +479,13 @@ DenseMatrix transposeDenseMatrix(const DenseMatrix& M)
                 // CPU fallback
                 use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver
                 if (use_gpu) {
-                    OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+                    if (gpu_mode.compare("cusparse") == 0) {
+                        OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+                    }
+
+                    if (gpu_mode.compare("opencl") == 0) {
+                        OpmLog::warning("openclSolver did not converge, now trying Dune to solve current linear system...");
+                    }
                 }

                 // call Dune
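
The dispatch above keys on the existing run-time GpuMode parameter ("none", "cusparse" or "opencl"): the same string now both selects the BdaBridge backend and is handed to WellContributions, so the fallback warning names the solver that actually ran. On the Flow command line this would be set via the GpuMode parameter, e.g. "--gpu-mode=opencl" (the hyphenated spelling is an assumption based on the usual eWoms parameter naming).
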
@@ -24,79 +24,202 @@

 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
 #include "opm/simulators/linalg/bda/WellContributions.hpp"

 namespace Opm
 {

+WellContributions::WellContributions(std::string gpu_mode){
+    if(gpu_mode.compare("cusparse") == 0){
+        cuda_gpu = true;
+    }
+
+    if(gpu_mode.compare("opencl") == 0){
+        opencl_gpu = true;
+    }
+}
+
 void WellContributions::alloc()
 {
     if (num_std_wells > 0) {
 #if HAVE_CUDA
-        allocStandardWells();
-#else
-        OPM_THROW(std::logic_error, "Error cannot allocate on GPU for StandardWells because CUDA was not found by cmake");
+        if(cuda_gpu){
+            allocStandardWells();
+        }
+#endif
+
+#if HAVE_OPENCL
+        if(opencl_gpu){
+            h_Cnnzs_ocl = new double[num_blocks * dim * dim_wells];
+            h_Dnnzs_ocl = new double[num_std_wells * dim_wells * dim_wells];
+            h_Bnnzs_ocl = new double[num_blocks * dim * dim_wells];
+            h_Ccols_ocl = new int[num_blocks];
+            h_Bcols_ocl = new int[num_blocks];
+            val_pointers = new unsigned int[num_std_wells + 1];
+
+            allocated = true;
+        }
+#endif
+
+#if !HAVE_CUDA && !HAVE_OPENCL
+        OPM_THROW(std::logic_error, "Error cannot allocate on GPU because neither CUDA nor OpenCL were found by cmake");
 #endif
     }
 }

 WellContributions::~WellContributions()
 {
-#if HAVE_CUDA
-    freeCudaMemory();
-#endif
-    if (h_x) {
-        delete[] h_x;
-        delete[] h_y;
-    }
-
     // delete MultisegmentWellContributions
     for (auto ms : multisegments) {
         delete ms;
     }
     multisegments.clear();
-}

+#if HAVE_CUDA
+    if(cuda_gpu){
+        freeCudaMemory(); // should come before 'delete[] h_x'
+    }
+#endif
+
 #if HAVE_OPENCL
-void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y) {
+    if (h_x_ocl) {
+        delete[] h_x_ocl;
+        delete[] h_y_ocl;
+    }
+
+    if(opencl_gpu){
+        if(num_std_wells > 0){
+            delete[] h_Cnnzs_ocl;
+            delete[] h_Dnnzs_ocl;
+            delete[] h_Bnnzs_ocl;
+            delete[] h_Ccols_ocl;
+            delete[] h_Bcols_ocl;
+            delete[] val_pointers;
+        }
+    }
+#endif
+}
+
+#if HAVE_OPENCL
+
+void WellContributions::init(cl::Context *context){
+    d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+    d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
+    d_Bnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
+    d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+    d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
+    d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
+}
+
+void WellContributions::copyDataToGPU(cl::CommandQueue *queue){
+    cl::Event event;
+
+    queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl);
+    queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl);
+    queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl);
+    queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl);
+    queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl);
+    queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &event);
+    event.wait();
+}
+
+void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y) {
     // apply MultisegmentWells
     if (num_ms_wells > 0) {
         // allocate pinned memory on host if not yet done
-        if (h_x == nullptr) {
-            h_x = new double[N];
-            h_y = new double[N];
+        if (h_x_ocl == nullptr) {
+            h_x_ocl = new double[N];
+            h_y_ocl = new double[N];
         }

         // copy vectors x and y from GPU to CPU
-        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x);
-        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x_ocl);
+        queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);

         // actually apply MultisegmentWells
         for (MultisegmentWellContribution *well : multisegments) {
-            well->apply(h_x, h_y);
+            well->apply(h_x_ocl, h_y_ocl);
         }

         // copy vector y from CPU to GPU
-        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
-    }
-
-    // apply StandardWells
-    if (num_std_wells > 0) {
-        OPM_THROW(std::logic_error, "Error StandardWells are not supported by openclSolver");
+        queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);
     }
 }
+
+void WellContributions::applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+    const unsigned int work_group_size = 32;
+    const unsigned int total_work_items = num_std_wells * work_group_size;
+    const unsigned int lmem1 = sizeof(double) * work_group_size;
+    const unsigned int lmem2 = sizeof(double) * dim_wells;
+
+    cl::Event event;
+    event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                      d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
+                      d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+    event.wait();
+}
+
+void WellContributions::apply(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+    if(num_std_wells > 0){
+        applyStdWell(queue, d_x, d_y, kernel);
+    }
+
+    if(num_ms_wells > 0){
+        applyMSWell(queue, d_x, d_y);
+    }
+}
 #endif

-void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]]int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
+void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
 {
+    if (!allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
+    }
+
 #if HAVE_CUDA
-    addMatrixGpu(type, colIndices, values, val_size);
-#else
-    OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because CUDA was not found by cmake");
+    if(cuda_gpu){
+        addMatrixGpu(type, colIndices, values, val_size);
+    }
+#endif
+
+#if HAVE_OPENCL
+    if(opencl_gpu){
+        switch (type) {
+        case MatrixType::C:
+            std::copy(colIndices, colIndices + val_size, h_Ccols_ocl + num_blocks_so_far);
+            std::copy(values, values + val_size*dim*dim_wells, h_Cnnzs_ocl + num_blocks_so_far*dim*dim_wells);
+            break;
+
+        case MatrixType::D:
+            std::copy(values, values + dim_wells*dim_wells, h_Dnnzs_ocl + num_std_wells_so_far*dim_wells*dim_wells);
+            break;
+
+        case MatrixType::B:
+            std::copy(colIndices, colIndices + val_size, h_Bcols_ocl + num_blocks_so_far);
+            std::copy(values, values + val_size*dim*dim_wells, h_Bnnzs_ocl + num_blocks_so_far*dim*dim_wells);
+            val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+
+            if(num_std_wells_so_far == num_std_wells - 1){
+                val_pointers[num_std_wells] = num_blocks;
+            }
+            break;
+
+        default:
+            OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
+        }
+
+        if (MatrixType::B == type) {
+            num_blocks_so_far += val_size;
+            num_std_wells_so_far++;
+        }
+    }
+
+#endif
+
+#if !HAVE_CUDA && !HAVE_OPENCL
+    OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
 #endif
 }

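
A layout note for the addMatrix() logic above (reading aid, not part of the commit): val_pointers is a CSR-style prefix array over the per-well B/C blocks, so well w owns block positions [val_pointers[w], val_pointers[w+1]). A hypothetical worked example:

    // Assumed sizes: two standard wells with 3 and 2 perforation blocks, dim = 3, dim_wells = 4.
    // After addNumBlocks(3), addNumBlocks(2), alloc() and the addMatrix() calls:
    //   num_std_wells = 2, num_blocks = 5
    //   val_pointers  = {0, 3, 5}
    //   h_Bnnzs_ocl and h_Cnnzs_ocl each hold 5 contiguous blocks of dim_wells*dim = 12 doubles;
    //   h_Dnnzs_ocl holds one dim_wells*dim_wells = 16-double block per well.
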
@@ -107,23 +230,23 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
     dim_wells = dim_wells_;
 }

-void WellContributions::addNumBlocks(unsigned int nnz)
+void WellContributions::addNumBlocks(unsigned int numBlocks)
 {
     if (allocated) {
         OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
     }
-    num_blocks += nnz;
+    num_blocks += numBlocks;
     num_std_wells++;
 }

-void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
+void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
         unsigned int Nb, unsigned int Mb,
         unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
         unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
         std::vector<double> &Cvalues)
 {
-    this->N = Nb * dim_;
-    MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells_, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
+    this->N = Nb * dim;
+    MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
     multisegments.emplace_back(well);
     ++num_ms_wells;
 }
@@ -138,12 +261,5 @@ void WellContributions::setReordering(int *toOrder_, bool reorder_)
     }
 }

-#if HAVE_OPENCL
-void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_)
-{
-    this->queue = queue_;
-}
-#endif
-
 } //namespace Opm

@@ -57,7 +57,6 @@ class WellContributions
 {
-
 public:

     /// StandardWell has C, D and B matrices that need to be copied
     enum class MatrixType {
         C,
@@ -78,17 +77,16 @@ private:
     bool allocated = false;
     std::vector<MultisegmentWellContribution*> multisegments;

+    int *toOrder = nullptr;
+    bool reorder = false;
+
+    bool opencl_gpu = false;
+    bool cuda_gpu = false;
+
 #if HAVE_CUDA
     cudaStream_t stream;
-#endif
-
-#if HAVE_OPENCL
-    cl::CommandQueue *queue = nullptr;
-#endif
-
-#if HAVE_CUDA
     // data for StandardWells, could remain nullptrs if not used
-    // StandardWells are only supported for cusparseSolver now
     double *d_Cnnzs = nullptr;
     double *d_Dnnzs = nullptr;
     double *d_Bnnzs = nullptr;
@@ -97,13 +95,29 @@ private:
     double *d_z1 = nullptr;
     double *d_z2 = nullptr;
     unsigned int *d_val_pointers = nullptr;
+    double *h_x = nullptr;
+    double *h_y = nullptr;
 #endif

-    double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
-
-    int *toOrder = nullptr;
-    bool reorder = false;
+#if HAVE_OPENCL
+    double *h_Cnnzs_ocl = nullptr;
+    double *h_Dnnzs_ocl = nullptr;
+    double *h_Bnnzs_ocl = nullptr;
+    int *h_Ccols_ocl = nullptr;
+    int *h_Bcols_ocl = nullptr;
+    double *h_x_ocl = nullptr;
+    double *h_y_ocl = nullptr;
+
+    cl::Buffer d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
+    cl::Buffer d_Ccols_ocl, d_Bcols_ocl, d_val_pointers_ocl;
+
+    typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                            cl::Buffer&, cl::Buffer&, cl::Buffer&,
+                            cl::Buffer&, const unsigned int, const unsigned int,
+                            cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
+#endif
+
+#if HAVE_CUDA
     /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
     /// \param[in] type indicate if C, D or B is sent
     /// \param[in] colIndices columnindices of blocks in C or B, ignored for D
@@ -116,32 +130,43 @@ private:

     /// Free GPU memory allocated with cuda.
     void freeCudaMemory();

-public:
-
-    /// Set a cudaStream to be used
-    /// \param[in] stream the cudaStream that is used to launch the kernel in
-#if HAVE_CUDA
-    void setCudaStream(cudaStream_t stream);
 #endif

-    /// Create a new WellContributions, implementation is empty
-    WellContributions() {};
+#if HAVE_OPENCL
+    void applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel);
+    void applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y);
+#endif

-    /// Destroy a WellContributions, and free memory
-    ~WellContributions();
+public:
+#if HAVE_CUDA
+    /// Set a cudaStream to be used
+    /// \param[in] stream the cudaStream that is used to launch the kernel in
+    void setCudaStream(cudaStream_t stream);

     /// Apply all Wells in this object
     /// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
     /// \param[in] d_x vector x, must be on GPU
     /// \param[inout] d_y vector y, must be on GPU
-#if HAVE_CUDA
     void apply(double *d_x, double *d_y);
+
+    unsigned int getNumWells(){
+        return num_std_wells + num_ms_wells;
+    }
 #endif

 #if HAVE_OPENCL
-    void apply(cl::Buffer& x, cl::Buffer& y);
+    void init(cl::Context *context);
+    void copyDataToGPU(cl::CommandQueue *queue);
+    void apply(cl::CommandQueue *queue, cl::Buffer& x, cl::Buffer& y, kernel_type *kernel);
 #endif

+    /// Create a new WellContributions
+    WellContributions(std::string gpu_mode);
+
+    /// Destroy a WellContributions, and free memory
+    ~WellContributions();
+
     /// Allocate memory for the StandardWells
     void alloc();

@@ -182,24 +207,11 @@ public:
         unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
         std::vector<double> &Cvalues);

-    /// Return the number of wells added to this object
-    /// \return the number of wells added to this object
-    unsigned int getNumWells() {
-        return num_std_wells + num_ms_wells;
-    }
-
     /// If the rows of the matrix are reordered, the columnindices of the matrixdata are incorrect
     /// Those indices need to be mapped via toOrder
     /// \param[in] toOrder array with mappings
     /// \param[in] reorder whether the columnindices need to be reordered or not
     void setReordering(int *toOrder, bool reorder);
-
-#if HAVE_OPENCL
-    /// This object copies some cl::Buffers, it requires a CommandQueue to do that
-    /// \param[in] queue the opencl commandqueue to be used
-    void setOpenCLQueue(cl::CommandQueue *queue);
-#endif
 };

 } //namespace Opm
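
For readers unfamiliar with the cl::make_kernel functor behind the kernel_type typedef above, a minimal sketch (illustration only, not part of the commit; it assumes an already-built cl::Program named program, a cl::CommandQueue named queue, and the usual dim/dim_wells/num_std_wells sizes):

    // Sketch: building and invoking a functor of the kernel_type shape above
    // (OpenCL 1.2 C++ bindings). Argument order must match the __kernel signature,
    // exactly as applyStdWell() does in the .cpp hunk.
    typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
                            cl::Buffer&, cl::Buffer&, cl::Buffer&,
                            cl::Buffer&, const unsigned int, const unsigned int,
                            cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> well_kernel_t;
    well_kernel_t add_wells(cl::Kernel(program, "add_well_contributions"));

    // one work-group of 32 work-items per standard well, as in applyStdWell()
    cl::Event event = add_wells(cl::EnqueueArgs(queue, cl::NDRange(num_std_wells * 32), cl::NDRange(32)),
                                d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl,
                                d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
                                cl::Local(sizeof(double) * 32),           // localSum
                                cl::Local(sizeof(double) * dim_wells),    // z1
                                cl::Local(sizeof(double) * dim_wells));   // z2
    event.wait();
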
@@ -23,7 +23,7 @@
 namespace bda
 {

-    const char* axpy_s = R"(
+    inline const char* axpy_s = R"(
     __kernel void axpy(
         __global double *in,
         const double a,
@@ -42,7 +42,7 @@ namespace bda


     // returns partial sums, instead of the final dot product
-    const char* dot_1_s = R"(
+    inline const char* dot_1_s = R"(
     __kernel void dot_1(
         __global double *in1,
         __global double *in2,
@@ -83,7 +83,7 @@ namespace bda

     // returns partial sums, instead of the final norm
     // the square root must be computed on CPU
-    const char* norm_s = R"(
+    inline const char* norm_s = R"(
     __kernel void norm(
         __global double *in,
         __global double *out,
@@ -122,7 +122,7 @@ namespace bda


     // p = (p - omega * v) * beta + r
-    const char* custom_s = R"(
+    inline const char* custom_s = R"(
     __kernel void custom(
         __global double *p,
         __global double *v,
@@ -150,7 +150,7 @@ namespace bda
     // algorithm based on:
     // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
     // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
-    const char* spmv_blocked_s = R"(
+    inline const char* spmv_blocked_s = R"(
     __kernel void spmv_blocked(
         __global const double *vals,
         __global const int *cols,
@@ -220,7 +220,7 @@ namespace bda

     // ILU apply part 1: forward substitution
     // solves L*x=y where L is a lower triangular sparse blocked matrix
-    const char* ILU_apply1_s = R"(
+    inline const char* ILU_apply1_s = R"(
     __kernel void ILU_apply1(
         __global const double *Lvals,
         __global const unsigned int *Lcols,
@@ -290,7 +290,7 @@ namespace bda

     // ILU apply part 2: backward substitution
     // solves U*x=y where L is a lower triangular sparse blocked matrix
-    const char* ILU_apply2_s = R"(
+    inline const char* ILU_apply2_s = R"(
     __kernel void ILU_apply2(
         __global const double *Uvals,
         __global const int *Ucols,
@@ -366,7 +366,99 @@ namespace bda
     }
     )";

+    inline const char* add_well_contributions_s = R"(
+    #pragma OPENCL EXTENSION cl_khr_fp64: enable
+    #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+
+    void atomicAdd(volatile __global double *val, const double delta){
+        union{
+            double f;
+            ulong i;
+        } old;
+
+        union{
+            double f;
+            ulong i;
+        } new;
+
+        do{
+            old.f = *val;
+            new.f = old.f + delta;
+        } while(atom_cmpxchg((volatile __global ulong *)val, old.i, new.i) != old.i);
+    }
+
+    __kernel void add_well_contributions(__global const double *valsC,
+                                         __global const double *valsD,
+                                         __global const double *valsB,
+                                         __global const int *colsC,
+                                         __global const int *colsB,
+                                         __global const double *x,
+                                         __global double *y,
+                                         const unsigned int blnc,
+                                         const unsigned int blnr,
+                                         __global const unsigned int *rowptr,
+                                         __local double *localSum,
+                                         __local double *z1,
+                                         __local double *z2){
+        int wgId = get_group_id(0);
+        int wiId = get_local_id(0);
+        int valSize = rowptr[wgId + 1] - rowptr[wgId];
+        int valsPerBlock = blnc*blnr;
+        int numActiveWorkItems = (32/valsPerBlock)*valsPerBlock;
+        int numBlocksPerWarp = 32/valsPerBlock;
+        int c = wiId % blnc;
+        int r = (wiId/blnc) % blnr;
+        double temp;
+
+        localSum[wiId] = 0;
+        if(wiId < numActiveWorkItems){
+            int b = wiId/valsPerBlock + rowptr[wgId];
+            while(b < valSize + rowptr[wgId]){
+                int colIdx = colsB[b];
+                localSum[wiId] += valsB[b*blnc*blnr + r*blnc + c]*x[colIdx*blnc + c];
+                b += numBlocksPerWarp;
+            }
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        int stride = valsPerBlock;
+        if(wiId < stride){
+            localSum[wiId] += localSum[wiId + stride];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if(c == 0 && wiId < valsPerBlock){
+            for(stride = 2; stride > 0; stride /= 2){
+                localSum[wiId] += localSum[wiId + stride];
+            }
+            z1[r] = localSum[wiId];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if(wiId < blnr){
+            temp = 0.0;
+            for(unsigned int i = 0; i < blnr; ++i){
+                temp += valsD[wgId*blnr*blnr + wiId*blnr + i]*z1[i];
+            }
+            z2[wiId] = temp;
+        }
+
+        barrier(CLK_GLOBAL_MEM_FENCE);
+
+        if(wiId < blnc*valSize){
+            temp = 0.0;
+            int bb = wiId/blnc + rowptr[wgId];
+            int colIdx = colsC[bb];
+            for (unsigned int j = 0; j < blnr; ++j){
+                temp += valsC[bb*blnc*blnr + j*blnc + c]*z2[j];
+            }
+            atomicAdd(&y[colIdx*blnc + c], temp);
+        }
+    }
+    )";
+
 } // end namespace bda

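
A reading aid for the kernel just added (commentary, not part of the commit). Each work-group of 32 work-items processes one standard well w, whose B and C blocks occupy positions rowptr[w] .. rowptr[w+1]-1. Stage 1 accumulates the block products of B with x into local memory and reduces them into z1 (blnr = dim_wells entries); stage 2 multiplies z1 by the well's dense blnr x blnr D block into z2; stage 3 forms the C^T contribution per perforated cell and scatters it into y with the CAS-based atomicAdd, since several wells may perforate the same cell and OpenCL has no native atomic add for doubles (hence the atom_cmpxchg loop over the 64-bit pattern, requiring cl_khr_int64_base_atomics). The documented contract is

    y -= C^T * (D^-1 * (B * x))

so the kernel's plain multiply by valsD relies on the D blocks being supplied already inverted, with any sign folded into the stored values (StandardWell assembles an inverted D block; this is an assumption about the caller, not something the kernel checks).
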
@@ -20,6 +20,7 @@
 #include <config.h>
 #include <cmath>
 #include <sstream>
+#include <iostream>

 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
@@ -70,7 +71,7 @@ unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A,
 template <unsigned int block_size>
 double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
 {
-    const unsigned int work_group_size = 1024;
+    const unsigned int work_group_size = 256;
     const unsigned int num_work_groups = ceilDivision(N, work_group_size);
     const unsigned int total_work_items = num_work_groups * work_group_size;
     const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
@@ -98,7 +99,7 @@ double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl
 template <unsigned int block_size>
 double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
 {
-    const unsigned int work_group_size = 1024;
+    const unsigned int work_group_size = 256;
     const unsigned int num_work_groups = ceilDivision(N, work_group_size);
     const unsigned int total_work_items = num_work_groups * work_group_size;
     const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
@@ -179,19 +180,14 @@ void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer
         }
     }
 }


 template <unsigned int block_size>
 void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
     float it;
     double rho, rhop, beta, alpha, omega, tmp1, tmp2;
     double norm, norm_0;

     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);

-    wellContribs.setOpenCLQueue(queue.get());
-    wellContribs.setReordering(toOrder, true);
-
     // set r to the initial residual
     // if initial x guess is not 0, must call applyblockedscaleadd(), not implemented
     //applyblockedscaleadd(-1.0, mat, x, r);
@@ -212,6 +208,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
     queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();

+    wellContribs.setReordering(toOrder, true);
+
     norm = norm_w(d_r, d_tmp);
     norm_0 = norm;

@@ -243,11 +241,9 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
         t_spmv.stop();

         // apply wellContributions
-        if (wellContribs.getNumWells() > 0) {
-            t_well.start();
-            wellContribs.apply(d_pw, d_v);
-            t_well.stop();
-        }
+        t_well.start();
+        wellContribs.apply(queue.get(), d_pw, d_v, add_well_contributions_k.get());
+        t_well.stop();

         t_rest.start();
         tmp1 = dot_w(d_rw, d_v, d_tmp);
@@ -274,11 +270,9 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
         t_spmv.stop();

         // apply wellContributions
-        if (wellContribs.getNumWells() > 0) {
-            t_well.start();
-            wellContribs.apply(d_s, d_t);
-            t_well.stop();
-        }
+        t_well.start();
+        wellContribs.apply(queue.get(), d_s, d_t, add_well_contributions_k.get());
+        t_well.stop();

         t_rest.start();
         tmp1 = dot_w(d_t, d_r, d_tmp);
@@ -314,7 +308,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
     }
     if (verbosity >= 4) {
         std::ostringstream out;
-        out << "openclSolver::ily_apply: " << t_prec.elapsed() << " s\n";
+        out << "openclSolver::ilu_apply: " << t_prec.elapsed() << " s\n";
+        out << "wellContributions::apply: " << t_well.elapsed() << " s\n";
         out << "openclSolver::spmv: " << t_spmv.elapsed() << " s\n";
         out << "openclSolver::rest: " << t_rest.elapsed() << " s\n";
         out << "openclSolver::total_solve: " << res.elapsed << " s\n";
@@ -324,7 +319,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr


 template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
+void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs) {
     this->N = N_;
     this->nnz = nnz_;
     this->nnzb = nnz_ / block_size / block_size;
@@ -466,6 +461,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
     source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
     source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
+    source.emplace_back(std::make_pair(add_well_contributions_s, strlen(add_well_contributions_s)));
     cl::Program program_ = cl::Program(*context, source);

     program_.build(devices);
@@ -481,7 +477,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
 #if COPY_ROW_BY_ROW
     vals_contiguous = new double[N];
 #endif

     mat.reset(new BlockedMatrix<block_size>(Nb, nnzb, vals, cols, rows));
-
     d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
@@ -500,6 +495,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
     d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));

+    wellContribs.init(context.get());
+
     // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
     // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -511,6 +508,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "spmv_blocked")));
     ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply1")));
     ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply2")));
+    add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program_, "add_well_contributions")));

     prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());

@@ -541,7 +539,7 @@ void openclSolverBackend<block_size>::finalize() {


 template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_system_to_gpu() {
+void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& wellContribs) {
     Timer t;
     cl::Event event;

@@ -563,6 +561,8 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& well
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();

+    wellContribs.copyDataToGPU(queue.get());
+
     if (verbosity > 2) {
         std::ostringstream out;
         out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@@ -706,7 +706,7 @@ void openclSolverBackend<block_size>::get_result(double *x) {
 template <unsigned int block_size>
 SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
     if (initialized == false) {
-        initialize(N_, nnz_, dim, vals, rows, cols);
+        initialize(N_, nnz_, dim, vals, rows, cols, wellContribs);
         if (analysis_done == false) {
             if (!analyse_matrix()) {
                 return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@@ -716,7 +716,7 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
-        copy_system_to_gpu();
+        copy_system_to_gpu(wellContribs);
     } else {
         update_system(vals, b);
         if (!create_preconditioner()) {
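
Taken together, the backend hunks above route the WellContributions object through the whole solver lifecycle. A condensed sketch of the resulting call chain (reading aid only, simplified and not compilable as-is; all names are the ones in the diff):

    // inside openclSolverBackend<block_size>::solve_system(..., wellContribs, res):
    initialize(N_, nnz_, dim, vals, rows, cols, wellContribs); // builds the cl::Program and calls wellContribs.init(context.get())
    copy_system_to_gpu(wellContribs);                          // also calls wellContribs.copyDataToGPU(queue.get())
    // inside gpu_pbicgstab(), unconditionally after each spmv_blocked_w():
    wellContribs.apply(queue.get(), d_pw, d_v, add_well_contributions_k.get());
    wellContribs.apply(queue.get(), d_s,  d_t, add_well_contributions_k.get());
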
@@ -64,6 +64,13 @@ private:
     cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
     double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()

+    //unsigned int num_blocks, dim_, dim_wells, num_std_wells;
+    //unsigned int *h_val_pointers;
+    //int *h_Ccols, *h_Bcols;
+    //double *h_Cnnzs, *h_Dnnzs, *h_Bnnzs;
+    //cl::Buffer d_Cnnzs, d_Dnnzs, d_Bnnzs;
+    //cl::Buffer d_Ccols, d_Bcols, d_val_pointers;
+
     // shared pointers are also passed to other objects
     std::shared_ptr<cl::Context> context;
     std::shared_ptr<cl::CommandQueue> queue;
@@ -74,6 +81,7 @@ private:
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
     std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
     std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
+    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > add_well_contributions_k;

     Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
     int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@@ -127,6 +135,8 @@ private:
     /// \param[out] b output vector
     void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b);

+    //void add_well_contributions_w(cl::Buffer valsC, cl::Buffer valsD, cl::Buffer valsB, cl::Buffer colsC, cl::Buffer colsB, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers);
+
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
@@ -139,13 +149,13 @@ private:
     /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows array of rowPointers, contains N/dim+1 values
    /// \param[in] cols array of columnIndices, contains nnz values
-    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs);

     /// Clean memory
     void finalize();

     /// Copy linear system to GPU
-    void copy_system_to_gpu();
+    void copy_system_to_gpu(WellContributions& wellContribs);

     /// Reorder the linear system so it corresponds with the coloring
     /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values