Merge pull request #3089 from Tongdongq/exact-ilu-decomp-gpu

Exact ilu decomp gpu
This commit is contained in:
Markus Blatt 2021-03-08 17:25:04 +01:00 committed by GitHub
commit 3ec181e231
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 1056 additions and 750 deletions

View File

@ -60,6 +60,7 @@ if(OPENCL_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)

View File

@ -250,12 +250,9 @@ namespace Opm
if (use_gpu) {
const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
WellContributions wellContribs(gpu_mode);
#if HAVE_OPENCL
if(gpu_mode.compare("opencl") == 0){
const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(&bdaBridge->getBackend());
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
}
#endif
bdaBridge->initWellContributions(wellContribs);
if (!useWellConn_) {
simulator_.problem().wellModel().getWellContributions(wellContribs);
}

View File

@ -18,6 +18,7 @@
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/MatrixBlock.hpp>
@ -32,14 +33,6 @@
namespace bda
{
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
#define CHOW_PATEL 0
#define CHOW_PATEL_GPU 1
using Opm::OpmLog;
using Dune::Timer;
@ -126,41 +119,54 @@ namespace bda
diagIndex = new int[mat->Nb];
invDiagVals = new double[mat->Nb * bs * bs];
#if CHOW_PATEL
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
#endif
LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs);
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Umat->nnzbs);
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Umat->Nb + 1));
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
#if CHOW_PATEL
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
#else
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
#endif
queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues);
queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues);
queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices);
queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers);
queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals);
events.resize(2);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals, nullptr, &events[0]);
int *rowsPerColorPrefix = new int[numColors + 1];
rowsPerColorPrefix[0] = 0;
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
for (int i = 0; i < numColors; ++i) {
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
}
queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix);
delete[] rowsPerColorPrefix;
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
return true;
} // end init()
// implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015
template <unsigned int block_size>
void BILU0<block_size>::chow_patel_decomposition()
{
#if CHOW_PATEL
const unsigned int bs = block_size;
int num_sweeps = 6;
@ -196,6 +202,8 @@ namespace bda
}
#endif
Timer t_total, t_preprocessing;
// Ut is actually BSC format
std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Nb, (nnzbs + Nb) / 2);
@ -274,7 +282,14 @@ namespace bda
// Ltmp is only needed for CPU decomposition, GPU creates GPU buffer for Ltmp
double *Utmp = new double[Ut->nnzbs * block_size * block_size];
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 ChowPatel preprocessing: " << t_preprocessing.stop() << " s";
OpmLog::info(out.str());
}
// actual ILU decomposition
Timer t_decomposition;
#if CHOW_PATEL_GPU
chowPatelIlu.decomposition(queue, context,
Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
@ -396,10 +411,18 @@ namespace bda
delete[] Ltmp;
#endif
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str());
}
Timer t_postprocessing;
// convert Ut to BSR
// diagonal stored separately
std::vector<int> ptr(Nb+1, 0);
std::vector<int> col(Ut->rowPointers[Nb]);
std::vector<int> cols(Ut->rowPointers[Nb]);
// count blocks per row for U (BSR)
// store diagonal in invDiagVals
@ -423,7 +446,7 @@ namespace bda
int j = Ut->colIndices[k];
if (j != i) {
int head = ptr[j]++;
col[head] = i;
cols[head] = i;
memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
}
}
@ -433,21 +456,55 @@ namespace bda
std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
ptr.front() = 0;
// reversing the rows of U, because that is the order they are used in during ILU apply
int URowIndex = 0;
int offsetU = 0; // number of nnz blocks that are already copied to Umat
Umat->rowPointers[0] = 0;
for (int i = LUmat->Nb - 1; i >= 0; i--) {
int rowSize = ptr[i + 1] - ptr[i]; // number of blocks in this row
memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize);
memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize);
offsetU += rowSize;
Umat->rowPointers[URowIndex + 1] = offsetU;
URowIndex++;
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel postprocessing: " << t_postprocessing.stop() << " s";
OpmLog::info(out.str());
}
Timer t_copyToGpu;
events.resize(3);
queue->enqueueWriteBuffer(s.Lvals, CL_FALSE, 0, Lmat->nnzbs * bs * bs * sizeof(double), Lmat->nnzValues, nullptr, &events[0]);
queue->enqueueWriteBuffer(s.Uvals, CL_FALSE, 0, Umat->nnzbs * bs * bs * sizeof(double), Utmp, nullptr, &events[1]);
queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, LUmat->Nb * bs * bs * sizeof(double), invDiagVals, nullptr, &events[2]);
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(8);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[3]);
queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
queue->enqueueWriteBuffer(s.Urows, CL_FALSE, 0, (Umat->Nb + 1) * sizeof(int), ptr.data(), nullptr, &events[7]);
});
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel copy to GPU: " << t_copyToGpu.stop() << " s\n";
out << "BILU0 ChowPatel total: " << t_total.stop() << " s";
OpmLog::info(out.str());
}
delete[] Utmp;
#endif // CHOW_PATEL
}
template <unsigned int block_size>
@ -479,118 +536,68 @@ namespace bda
OpmLog::info(out.str());
}
Timer t_decomposition;
#if CHOW_PATEL
chow_patel_decomposition();
#else
int i, j, ij, ik, jk;
int iRowStart, iRowEnd, jRowEnd;
double pivot[bs * bs];
int LSize = 0;
Opm::Detail::Inverter<bs> inverter; // reuse inverter to invert blocks
// go through all rows
for (i = 0; i < LUmat->Nb; i++) {
iRowStart = LUmat->rowPointers[i];
iRowEnd = LUmat->rowPointers[i + 1];
// go through all elements of the row
for (ij = iRowStart; ij < iRowEnd; ij++) {
j = LUmat->colIndices[ij];
// if the element is the diagonal, store the index and go to next row
if (j == i) {
diagIndex[i] = ij;
break;
}
// if an element beyond the diagonal is reach, no diagonal was found
// throw an error now. TODO: perform reordering earlier to prevent this
if (j > i) {
std::ostringstream out;
out << "BILU0 Error could not find diagonal value in row: " << i;
OpmLog::error(out.str());
return false;
}
LSize++;
// calculate the pivot of this row
blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
memcpy(LUmat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * block_size * block_size);
jRowEnd = LUmat->rowPointers[j + 1];
jk = diagIndex[j] + 1;
ik = ij + 1;
// substract that row scaled by the pivot from this row.
while (ik < iRowEnd && jk < jRowEnd) {
if (LUmat->colIndices[ik] == LUmat->colIndices[jk]) {
blockMultSub<bs>(LUmat->nnzValues + ik * bs * bs, pivot, LUmat->nnzValues + jk * bs * bs);
ik++;
jk++;
} else {
if (LUmat->colIndices[ik] < LUmat->colIndices[jk])
{ ik++; }
else
{ jk++; }
}
}
}
// store the inverse in the diagonal!
inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
memcpy(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
}
Lmat->rowPointers[0] = 0;
Umat->rowPointers[0] = 0;
// Split the LU matrix into two by comparing column indices to diagonal indices
for (i = 0; i < LUmat->Nb; i++) {
int offsetL = Lmat->rowPointers[i];
int rowSize = diagIndex[i] - LUmat->rowPointers[i];
int offsetLU = LUmat->rowPointers[i];
memcpy(Lmat->nnzValues + offsetL * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize);
memcpy(Lmat->colIndices + offsetL, LUmat->colIndices + offsetLU, sizeof(int) * rowSize);
offsetL += rowSize;
Lmat->rowPointers[i + 1] = offsetL;
}
// Reverse the order or the (blocked) rows for the U matrix,
// because the rows are accessed in reverse order when applying the ILU0
int URowIndex = 0;
for (i = LUmat->Nb - 1; i >= 0; i--) {
int offsetU = Umat->rowPointers[URowIndex];
int rowSize = LUmat->rowPointers[i + 1] - diagIndex[i] - 1;
int offsetLU = diagIndex[i] + 1;
memcpy(Umat->nnzValues + offsetU * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize);
memcpy(Umat->colIndices + offsetU, LUmat->colIndices + offsetLU, sizeof(int) * rowSize);
offsetU += rowSize;
Umat->rowPointers[URowIndex + 1] = offsetU;
URowIndex++;
}
#endif
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str());
}
Timer t_copyToGpu;
if (pattern_uploaded == false) {
queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices);
queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers);
pattern_uploaded = true;
events.resize(1);
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(4);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
});
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues);
queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues);
queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(double) * bs * bs, invDiagVals);
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
OpmLog::info(out.str());
}
Timer t_decomposition;
std::ostringstream out;
cl::Event event;
for (int color = 0; color < numColors; ++color) {
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color+1];
const unsigned int work_group_size2 = 128;
const unsigned int num_work_groups2 = 1024;
const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
const unsigned int num_hwarps_per_group = work_group_size2 / 16;
const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double); // each block needs a pivot
if (verbosity >= 4) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
event.wait();
}
if (verbosity >= 3) {
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str());
}
#endif // CHOW_PATEL
return true;
} // end create_preconditioner()
@ -604,16 +611,24 @@ namespace bda
Timer t_apply;
for(int color = 0; color < numColors; ++color){
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
#if CHOW_PATEL
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
#else
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
#endif
// event.wait();
}
for(int color = numColors-1; color >= 0; --color){
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
#if CHOW_PATEL
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
#else
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
#endif
// event.wait();
}
if (verbosity >= 3) {
if (verbosity >= 4) {
event.wait();
std::ostringstream out;
out << "BILU0 apply: " << t_apply.stop() << " s";
@ -638,11 +653,13 @@ namespace bda
}
template <unsigned int block_size>
void BILU0<block_size>::setKernels(
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k_
){
this->ILU_apply1 = ILU_apply1_;
this->ILU_apply2 = ILU_apply2_;
this->ilu_decomp_k = ilu_decomp_k_;
}
@ -657,8 +674,9 @@ template void BILU0<n>::setOpenCLContext(cl::Context*);
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*); \
template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int); \
template void BILU0<n>::setKernels( \
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> * \
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> * \
);
INSTANTIATE_BDA_FUNCTIONS(1);

View File

@ -20,12 +20,27 @@
#ifndef BILU0_HPP
#define BILU0_HPP
#include <mutex>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
// the exact decomposition uses a full matrix LU which is the superposition of L and U
// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
#define CHOW_PATEL 0
#define CHOW_PATEL_GPU 1
namespace bda
{
@ -40,33 +55,48 @@ namespace bda
int Nb; // number of blockrows of the matrix
int nnz; // number of nonzeroes of the matrix (scalar)
int nnzbs; // number of blocks of the matrix
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr, LUmat = nullptr;
std::unique_ptr<BlockedMatrix<block_size> > LUmat = nullptr;
std::shared_ptr<BlockedMatrix<block_size> > rmat = nullptr; // only used with PAR_SIM
#if CHOW_PATEL
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
#endif
double *invDiagVals = nullptr;
int *diagIndex = nullptr;
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
int *toOrder = nullptr, *fromOrder = nullptr;
int numColors;
int verbosity;
std::once_flag pattern_uploaded;
ILUReorder opencl_ilu_reorder;
typedef struct {
cl::Buffer Lvals, Uvals, invDiagVals;
cl::Buffer Lcols, Lrows;
cl::Buffer Ucols, Urows;
cl::Buffer invDiagVals;
cl::Buffer diagIndex;
cl::Buffer rowsPerColor;
#if CHOW_PATEL
cl::Buffer Lvals, Lcols, Lrows;
cl::Buffer Uvals, Ucols, Urows;
#else
cl::Buffer LUvals, LUcols, LUrows;
#endif
} GPU_storage;
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1;
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2;
ilu_apply1_kernel_type *ILU_apply1;
ilu_apply2_kernel_type *ILU_apply2;
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg> *ilu_decomp_k;
GPU_storage s;
cl::Context *context;
cl::CommandQueue *queue;
std::vector<cl::Event> events;
cl_int err;
int work_group_size = 0;
int total_work_items = 0;
int lmem_per_work_group = 0;
bool pattern_uploaded = false;
ChowPatelIlu chowPatelIlu;
@ -91,8 +121,9 @@ namespace bda
void setOpenCLQueue(cl::CommandQueue *queue);
void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
void setKernels(
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k
);
int* getToOrder()

View File

@ -28,6 +28,19 @@
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#if HAVE_CUDA
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
#endif
#if HAVE_OPENCL
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
#endif
#if HAVE_FPGA
#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
#endif
#define PRINT_TIMERS_BRIDGE 0
typedef Dune::InverseOperatorResult InverseOperatorResult;
@ -41,7 +54,8 @@ namespace Opm
using bda::ILUReorder;
template <class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
: gpu_mode(gpu_mode_)
{
if (gpu_mode.compare("cusparse") == 0) {
#if HAVE_CUDA
@ -224,6 +238,19 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector
}
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(WellContributions& wellContribs) {
if(gpu_mode.compare("opencl") == 0){
#if HAVE_OPENCL
const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(backend.get());
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
}
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
@ -241,6 +268,11 @@ template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::al
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
n>::get_result \
(Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >&); \
\
template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
n>::initWellContributions(WellContributions&)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);

View File

@ -25,16 +25,10 @@
#include "dune/istl/bcrsmatrix.hh"
#include <opm/simulators/linalg/matrixblock.hh>
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#if HAVE_CUDA
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
#endif
#if HAVE_OPENCL
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
#endif
namespace Opm
{
@ -48,6 +42,7 @@ class BdaBridge
{
private:
bool use_gpu = false;
std::string gpu_mode;
std::unique_ptr<bda::BdaSolver<block_size> > backend;
public:
@ -79,9 +74,11 @@ public:
return use_gpu;
}
const bda::BdaSolver<block_size>& getBackend() const {
return *backend;
}
/// Initialize the WellContributions object with opencl context and queue
/// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
/// \param[in] wellContribs container to hold all WellContributions
void initWellContributions(WellContributions& wellContribs);
}; // end class BdaBridge

View File

@ -529,7 +529,7 @@ void ChowPatelIlu::decomposition(
err |= queue->enqueueWriteBuffer(d_LU_cols, CL_FALSE, 0, sizeof(int) * LU_nnzbs, LU_cols, nullptr, &events[5]);
cl::WaitForEvents(events);
events.clear();
if (verbosity >= 3){
if (verbosity >= 4){
std::ostringstream out;
out << "ChowPatelIlu copy sparsity pattern time: " << t_copy_pattern.stop() << " s";
OpmLog::info(out.str());
@ -550,7 +550,7 @@ void ChowPatelIlu::decomposition(
err |= queue->enqueueWriteBuffer(d_LU_vals, CL_FALSE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals, nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
if (verbosity >= 3){
if (verbosity >= 4){
std::ostringstream out;
out << "ChowPatelIlu copy1 time: " << t_copy1.stop() << " s";
OpmLog::info(out.str());
@ -585,7 +585,7 @@ void ChowPatelIlu::decomposition(
d_Ut_idxs, d_L_cols, d_LU_cols,
*Larg2, *Uarg2, Nb, cl::Local(lmem_per_work_group), cl::Local(lmem_per_work_group));
event.wait();
if (verbosity >= 3){
if (verbosity >= 4){
std::ostringstream out;
out << "ChowPatelIlu sweep kernel time: " << t_kernel.stop() << " s";
OpmLog::info(out.str());
@ -604,7 +604,7 @@ void ChowPatelIlu::decomposition(
}
cl::WaitForEvents(events);
events.clear();
if (verbosity >= 3){
if (verbosity >= 4){
std::ostringstream out;
out << "ChowPatelIlu copy2 time: " << t_copy2.stop() << " s";
OpmLog::info(out.str());

View File

@ -201,11 +201,10 @@ void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder,
void colorsToReordering(int Nb, std::vector<int>& colors, int numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
int reordered = 0;
int i, c;
// Find reordering patterns
for (c = 0; c < numColors; c++) {
for (i = 0; i < Nb; i++) {
for (int c = 0; c < numColors; c++) {
for (int i = 0; i < Nb; i++) {
if (colors[i] == c) {
rowsPerColor[c]++;
toOrder[i] = reordered;
@ -259,10 +258,11 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd
int activeRowIndex = 0, colorEnd, nextActiveRowIndex = 0;
int thisRow;
std::vector<bool> doneRows(Nb, false);
rowsPerColor.reserve(Nb);
std::vector <int> rowsToStart;
// since emplace_back() is used to fill, the vector must be empty
assert(rowsPerColor.size() == 0);
// find starting rows: rows that are independent from all rows that come before them.
for (thisRow = 0; thisRow < Nb; thisRow++) {
if (canBeStarted(thisRow, CSCColPointers, CSCRowIndices, doneRows)) {
@ -312,8 +312,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd
template <unsigned int block_size>
void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
std::vector<int> rowColor;
rowColor.resize(Nb);
std::vector<int> rowColor(Nb);
*numColors = colorBlockedNodes<block_size>(Nb, CSRRowPointers, CSRColIndices, CSCColPointers, CSCRowIndices, rowColor, maxRowsPerColor, maxColsPerColor);

View File

@ -90,7 +90,7 @@ bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIn
/// \param[out] numColors a pointer to the number of colors needed for the level scheduling
/// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
/// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill
void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor);
/// Find a graph coloring reordering for an input matrix
@ -105,7 +105,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd
/// \param[out] numColors the number of colors used in the found graph coloring
/// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
/// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function will resize()
template <unsigned int block_size>
void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor);

View File

@ -22,7 +22,7 @@
#include <cstring>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace Opm
@ -74,7 +74,7 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu
this->queue = queue_;
}
void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){
void WellContributions::setKernel(stdwell_apply_kernel_type *kernel_, stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}

View File

@ -26,6 +26,7 @@
#if HAVE_OPENCL
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#endif
#include <vector>
@ -39,6 +40,9 @@
namespace Opm
{
using bda::stdwell_apply_kernel_type;
using bda::stdwell_apply_no_reorder_kernel_type;
/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
/// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
/// So far, StandardWell and MultisegmentWell are supported
@ -92,17 +96,10 @@ private:
std::vector<MultisegmentWellContribution*> multisegments;
#if HAVE_OPENCL
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type_no_reorder;
cl::Context *context;
cl::CommandQueue *queue;
kernel_type *kernel;
kernel_type_no_reorder *kernel_no_reorder;
stdwell_apply_kernel_type *kernel;
stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
@ -154,7 +151,7 @@ public:
#endif
#if HAVE_OPENCL
void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_);
void setKernel(stdwell_apply_kernel_type *kernel_, stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect

View File

@ -0,0 +1,621 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <opm/simulators/linalg/bda/openclKernels.hpp>
namespace bda
{
std::string get_axpy_string() {
return R"(
__kernel void axpy(
__global double *in,
const double a,
__global double *out,
const int N)
{
unsigned int NUM_THREADS = get_global_size(0);
int idx = get_global_id(0);
while(idx < N){
out[idx] += a * in[idx];
idx += NUM_THREADS;
}
}
)";
}
// returns partial sums, instead of the final dot product
std::string get_dot_1_string() {
return R"(
__kernel void dot_1(
__global double *in1,
__global double *in2,
__global double *out,
const unsigned int N,
__local double *tmp)
{
unsigned int tid = get_local_id(0);
unsigned int bsize = get_local_size(0);
unsigned int bid = get_global_id(0) / bsize;
unsigned int i = get_global_id(0);
unsigned int NUM_THREADS = get_global_size(0);
double sum = 0.0;
while(i < N){
sum += in1[i] * in2[i];
i += NUM_THREADS;
}
tmp[tid] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
{
if (tid < s)
{
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write result for this block to global mem
if (tid == 0) out[get_group_id(0)] = tmp[0];
}
)";
}
// returns partial sums, instead of the final norm
// the square root must be computed on CPU
std::string get_norm_string() {
return R"(
__kernel void norm(
__global double *in,
__global double *out,
const unsigned int N,
__local double *tmp)
{
unsigned int tid = get_local_id(0);
unsigned int bsize = get_local_size(0);
unsigned int bid = get_global_id(0) / bsize;
unsigned int i = get_global_id(0);
unsigned int NUM_THREADS = get_global_size(0);
double local_sum = 0.0;
while(i < N){
local_sum += in[i] * in[i];
i += NUM_THREADS;
}
tmp[tid] = local_sum;
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
{
if (tid < s)
{
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write result for this block to global mem
if (tid == 0) out[get_group_id(0)] = tmp[0];
}
)";
}
// p = (p - omega * v) * beta + r
std::string get_custom_string() {
return R"(
__kernel void custom(
__global double *p,
__global double *v,
__global double *r,
const double omega,
const double beta,
const int N)
{
const unsigned int NUM_THREADS = get_global_size(0);
unsigned int idx = get_global_id(0);
while(idx < N){
double res = p[idx];
res -= omega * v[idx];
res *= beta;
res += r[idx];
p[idx] = res;
idx += NUM_THREADS;
}
}
)";
}
std::string get_spmv_blocked_string() {
return R"(
__kernel void spmv_blocked(
__global const double *vals,
__global const int *cols,
__global const int *rows,
const int Nb,
__global const double *x,
__global double *b,
const unsigned int block_size,
__local double *tmp)
{
const unsigned int warpsize = 32;
const unsigned int bsize = get_local_size(0);
const unsigned int idx_b = get_global_id(0) / bsize;
const unsigned int idx_t = get_local_id(0);
unsigned int idx = idx_b * bsize + idx_t;
const unsigned int bs = block_size;
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int target_block_row = idx / warpsize;
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
// for 3x3 blocks:
// num_active_threads: 27
// num_blocks_per_warp: 3
while(target_block_row < Nb){
unsigned int first_block = rows[target_block_row];
unsigned int last_block = rows[target_block_row+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
for(; block < last_block; block += num_blocks_per_warp){
double x_elem = x[cols[block]*bs + c];
double A_elem = vals[block*bs*bs + c + r*bs];
local_out += x_elem * A_elem;
}
}
// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lane < bs){
unsigned int row = target_block_row*bs + lane;
b[row] = tmp[lane];
}
target_block_row += num_warps_in_grid;
}
}
)";
}
std::string get_ILU_apply1_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply1(
__global const double *LUvals,
__global const unsigned int *LUcols,
__global const unsigned int *LUrows,
__global const int *diagIndex,
__global const double *y,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = LUrows[target_block_row];
)";
if (full_matrix) {
s += "const unsigned int last_block = diagIndex[target_block_row]; ";
} else {
s += "const unsigned int last_block = LUrows[target_block_row+1]; ";
}
s += R"(
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
local_out = y[target_block_row*bs+lane];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c];
const double A_elem = LUvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}
// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lane < bs){
const unsigned int row = target_block_row*bs + lane;
x[row] = tmp[lane];
}
target_block_row += num_warps_in_grid;
}
}
)";
return s;
}
std::string get_ILU_apply2_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply2(
__global const double *LUvals,
__global const int *LUcols,
__global const int *LUrows,
__global const int *diagIndex,
__global const double *invDiagVals,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
const double relaxation = 0.9;
while(target_block_row < nodesPerColorPrefix[color+1]){
)";
if (full_matrix) {
s += "const unsigned int first_block = diagIndex[target_block_row] + 1; ";
} else {
s += "const unsigned int first_block = LUrows[target_block_row]; ";
}
s += R"(
const unsigned int last_block = LUrows[target_block_row+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
const unsigned int row = target_block_row*bs+lane;
local_out = x[row];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c];
const double A_elem = LUvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}
// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
local_out = tmp[lane];
if(lane < bs){
tmp[lane + bs*idx_t/warpsize] = local_out;
double sum = 0.0;
for(int i = 0; i < bs; ++i){
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
}
const unsigned int row = target_block_row*bs + lane;
x[row] = relaxation * sum;
}
target_block_row += num_warps_in_grid;
}
}
)";
return s;
}
std::string get_stdwell_apply_string(bool reorder) {
std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
std::string s = "__kernel void " + kernel_name + R"((
__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
)";
if (reorder) {
s += R"(__global const int *toOrder,
)";
}
s += R"(const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;
barrier(CLK_LOCAL_MEM_FENCE);
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
)";
if (reorder) {
s += "int colIdx = toOrder[Bcols[b]]; ";
} else {
s += "int colIdx = Bcols[b]; ";
}
s += R"(
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}
if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}
b = wiId/valsPerBlock + val_pointers[wgId];
if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
}
z1[r] = localSum[wiId];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
)";
if (reorder) {
s += "int colIdx = toOrder[Ccols[bb]]; ";
} else {
s += "int colIdx = Ccols[bb]; ";
}
s += R"(
y[colIdx*dim + c] -= temp;
}
}
)";
return s;
}
std::string get_ilu_decomp_string() {
return R"(
// a = a - (b * c)
__kernel void block_mult_sub(__global double *a, __local double *b, __global double *c)
{
const unsigned int block_size = 3;
const unsigned int hwarp_size = 16;
const unsigned int idx_t = get_local_id(0); // thread id in work group
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
if(thread_id_in_hwarp < block_size * block_size){
const unsigned int row = thread_id_in_hwarp / block_size;
const unsigned int col = thread_id_in_hwarp % block_size;
double temp = 0.0;
for (unsigned int k = 0; k < block_size; k++) {
temp += b[block_size * row + k] * c[block_size * k + col];
}
a[block_size * row + col] -= temp;
}
}
// c = a * b
__kernel void block_mult(__global double *a, __global double *b, __local double *c) {
const unsigned int block_size = 3;
const unsigned int hwarp_size = 16;
const unsigned int idx_t = get_local_id(0); // thread id in work group
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
if(thread_id_in_hwarp < block_size * block_size){
const unsigned int row = thread_id_in_hwarp / block_size;
const unsigned int col = thread_id_in_hwarp % block_size;
double temp = 0.0;
for (unsigned int k = 0; k < block_size; k++) {
temp += a[block_size * row + k] * b[block_size * k + col];
}
c[block_size * row + col] = temp;
}
}
// invert 3x3 matrix
__kernel void inverter(__global double *matrix, __global double *inverse) {
const unsigned int block_size = 3;
const unsigned int bs = block_size; // rename to shorter name
const unsigned int hwarp_size = 16;
const unsigned int idx_t = get_local_id(0); // thread id in work group
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
if(thread_id_in_hwarp < bs * bs){
double t4 = matrix[0] * matrix[4];
double t6 = matrix[0] * matrix[5];
double t8 = matrix[1] * matrix[3];
double t10 = matrix[2] * matrix[3];
double t12 = matrix[1] * matrix[6];
double t14 = matrix[2] * matrix[6];
double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
double t17 = 1.0 / det;
const unsigned int r = thread_id_in_hwarp / bs;
const unsigned int c = thread_id_in_hwarp % bs;
const unsigned int r1 = (r+1) % bs;
const unsigned int c1 = (c+1) % bs;
const unsigned int r2 = (r+bs-1) % bs;
const unsigned int c2 = (c+bs-1) % bs;
inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
}
}
__kernel void ilu_decomp(const unsigned int firstRow,
const unsigned int lastRow,
__global double *LUvals,
__global const int *LUcols,
__global const int *LUrows,
__global double *invDiagVals,
__global int *diagIndex,
const unsigned int Nb,
__local double *pivot){
const unsigned int bs = 3;
const unsigned int hwarp_size = 16;
const unsigned int work_group_size = get_local_size(0);
const unsigned int work_group_id = get_group_id(0);
const unsigned int num_groups = get_num_groups(0);
const unsigned int hwarps_per_group = work_group_size / hwarp_size;
const unsigned int thread_id_in_group = get_local_id(0); // thread id in work group
const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size; // thread id in hwarp (16 threads)
const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size;
const unsigned int lmem_offset = hwarp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it
// every workitem in a hwarp has the same lmem_offset
// go through all rows
for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
{
int iRowStart = LUrows[i];
int iRowEnd = LUrows[i + 1];
// go through all elements of the row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUcols[ij];
if (j < i) {
// calculate the pivot of this row
block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
// copy pivot
if (thread_id_in_hwarp < bs * bs) {
LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp];
}
int jRowEnd = LUrows[j + 1];
int jk = diagIndex[j] + 1;
int ik = ij + 1;
// substract that row scaled by the pivot from this row.
while (ik < iRowEnd && jk < jRowEnd) {
if (LUcols[ik] == LUcols[jk]) {
block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs);
ik++;
jk++;
} else {
if (LUcols[ik] < LUcols[jk])
{ ik++; }
else
{ jk++; }
}
}
}
}
// store the inverse in the diagonal
inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
// copy inverse
if (thread_id_in_hwarp < bs * bs) {
LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
}
}
}
)";
}
} // end namespace bda

View File

@ -20,498 +20,76 @@
#ifndef OPENCL_HPP
#define OPENCL_HPP
#include <string>
#include <opm/simulators/linalg/bda/opencl.hpp>
namespace bda
{
inline const char* axpy_s = R"(
__kernel void axpy(
__global double *in,
const double a,
__global double *out,
const int N)
{
unsigned int NUM_THREADS = get_global_size(0);
int idx = get_global_id(0);
using spmv_kernel_type = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply1_kernel_type = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply2_kernel_type = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using stdwell_apply_kernel_type = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
using stdwell_apply_no_reorder_kernel_type = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
using ilu_decomp_kernel_type = cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;
while(idx < N){
out[idx] += a * in[idx];
idx += NUM_THREADS;
}
}
)";
/// Generate string with axpy kernel
/// a = a + alpha * b
std::string get_axpy_string();
/// returns partial sums, instead of the final dot product
/// partial sums are added on CPU
std::string get_dot_1_string();
// returns partial sums, instead of the final dot product
inline const char* dot_1_s = R"(
__kernel void dot_1(
__global double *in1,
__global double *in2,
__global double *out,
const unsigned int N,
__local double *tmp)
{
unsigned int tid = get_local_id(0);
unsigned int bsize = get_local_size(0);
unsigned int bid = get_global_id(0) / bsize;
unsigned int i = get_global_id(0);
unsigned int NUM_THREADS = get_global_size(0);
/// returns partial sums, instead of the final norm
/// the square root must be computed on CPU
std::string get_norm_string();
double sum = 0.0;
while(i < N){
sum += in1[i] * in2[i];
i += NUM_THREADS;
}
tmp[tid] = sum;
/// Generate string with custom kernel
/// This kernel combines some ilubicgstab vector operations into 1
/// p = (p - omega * v) * beta + r
std::string get_custom_string();
barrier(CLK_LOCAL_MEM_FENCE);
/// b = mat * x
/// algorithm based on:
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
std::string get_spmv_blocked_string();
// do reduction in shared mem
for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
{
if (tid < s)
{
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
/// ILU apply part 1: forward substitution
/// solves L*x=y where L is a lower triangular sparse blocked matrix
/// this L can be it's own BSR matrix (if full_matrix is false),
/// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
std::string get_ILU_apply1_string(bool full_matrix);
// write result for this block to global mem
if (tid == 0) out[get_group_id(0)] = tmp[0];
}
)";
/// ILU apply part 2: backward substitution
/// solves U*x=y where U is an upper triangular sparse blocked matrix
/// this U can be it's own BSR matrix (if full_matrix is false),
/// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
std::string get_ILU_apply2_string(bool full_matrix);
/// Generate string with the stdwell_apply kernels
/// If reorder is true, the B/Ccols do not correspond with the x/y vector
/// the x/y vector is reordered, use toOrder to address that
/// \param[in] reorder whether the matrix is reordered or not
std::string get_stdwell_apply_string(bool reorder);
// returns partial sums, instead of the final norm
// the square root must be computed on CPU
inline const char* norm_s = R"(
__kernel void norm(
__global double *in,
__global double *out,
const unsigned int N,
__local double *tmp)
{
unsigned int tid = get_local_id(0);
unsigned int bsize = get_local_size(0);
unsigned int bid = get_global_id(0) / bsize;
unsigned int i = get_global_id(0);
unsigned int NUM_THREADS = get_global_size(0);
double local_sum = 0.0;
while(i < N){
local_sum += in[i] * in[i];
i += NUM_THREADS;
}
tmp[tid] = local_sum;
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
{
if (tid < s)
{
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write result for this block to global mem
if (tid == 0) out[get_group_id(0)] = tmp[0];
}
)";
// p = (p - omega * v) * beta + r
inline const char* custom_s = R"(
__kernel void custom(
__global double *p,
__global double *v,
__global double *r,
const double omega,
const double beta,
const int N)
{
const unsigned int NUM_THREADS = get_global_size(0);
unsigned int idx = get_global_id(0);
while(idx < N){
double res = p[idx];
res -= omega * v[idx];
res *= beta;
res += r[idx];
p[idx] = res;
idx += NUM_THREADS;
}
}
)";
// b = mat * x
// algorithm based on:
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
inline const char* spmv_blocked_s = R"(
__kernel void spmv_blocked(
__global const double *vals,
__global const int *cols,
__global const int *rows,
const int Nb,
__global const double *x,
__global double *b,
const unsigned int block_size,
__local double *tmp)
{
const unsigned int warpsize = 32;
const unsigned int bsize = get_local_size(0);
const unsigned int idx_b = get_global_id(0) / bsize;
const unsigned int idx_t = get_local_id(0);
unsigned int idx = idx_b * bsize + idx_t;
const unsigned int bs = block_size;
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int target_block_row = idx / warpsize;
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
// for 3x3 blocks:
// num_active_threads: 27
// num_blocks_per_warp: 3
while(target_block_row < Nb){
unsigned int first_block = rows[target_block_row];
unsigned int last_block = rows[target_block_row+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
for(; block < last_block; block += num_blocks_per_warp){
double x_elem = x[cols[block]*bs + c];
double A_elem = vals[block*bs*bs + c + r*bs];
local_out += x_elem * A_elem;
}
}
// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lane < bs){
unsigned int row = target_block_row*bs + lane;
b[row] = tmp[lane];
}
target_block_row += num_warps_in_grid;
}
}
)";
// ILU apply part 1: forward substitution
// solves L*x=y where L is a lower triangular sparse blocked matrix
inline const char* ILU_apply1_s = R"(
__kernel void ILU_apply1(
__global const double *Lvals,
__global const unsigned int *Lcols,
__global const unsigned int *Lrows,
const unsigned int LrowSize,
__global const double *y,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
idx += nodesPerColorPrefix[color];
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = Lrows[target_block_row];
const unsigned int last_block = Lrows[target_block_row+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
local_out = y[target_block_row*bs+lane];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[Lcols[block]*bs + c];
const double A_elem = Lvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}
// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lane < bs){
const unsigned int row = target_block_row*bs + lane;
x[row] = tmp[lane];
}
target_block_row += num_warps_in_grid;
}
}
)";
// ILU apply part 2: backward substitution
// solves U*x=y where L is a lower triangular sparse blocked matrix
inline const char* ILU_apply2_s = R"(
__kernel void ILU_apply2(
__global const double *Uvals,
__global const int *Ucols,
__global const int *Urows,
const unsigned int UrowSize,
__global const double *invDiagVals,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
idx += nodesPerColorPrefix[color];
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
const double relaxation = 0.9;
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = Urows[UrowSize-target_block_row-1];
const unsigned int last_block = Urows[UrowSize-target_block_row];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
const unsigned int row = target_block_row*bs+lane;
local_out = x[row];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[Ucols[block]*bs + c];
const double A_elem = Uvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}
// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
local_out = tmp[lane];
if(lane < bs){
tmp[lane + bs*idx_t/warpsize] = local_out;
double sum = 0.0;
for(int i = 0; i < bs; ++i){
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
}
const unsigned int row = target_block_row*bs + lane;
x[row] = relaxation * sum;
}
target_block_row += num_warps_in_grid;
}
}
)";
inline const char* stdwell_apply_s = R"(
__kernel void stdwell_apply(__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
__global const int *toOrder,
const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;
barrier(CLK_LOCAL_MEM_FENCE);
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
int colIdx = toOrder[Bcols[b]];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}
if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}
b = wiId/valsPerBlock + val_pointers[wgId];
if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
}
z1[r] = localSum[wiId];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
int colIdx = toOrder[Ccols[bb]];
y[colIdx*dim + c] -= temp;
}
}
)";
inline const char* stdwell_apply_no_reorder_s = R"(
__kernel void stdwell_apply_no_reorder(__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;
barrier(CLK_LOCAL_MEM_FENCE);
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}
if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}
b = wiId/valsPerBlock + val_pointers[wgId];
if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
}
z1[r] = localSum[wiId];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
int colIdx = Ccols[bb];
y[colIdx*dim + c] -= temp;
}
}
)";
/// Generate string with the exact ilu decomposition kernel
/// The kernel takes a full BSR matrix and performs inplace ILU decomposition
std::string get_ilu_decomp_string();
} // end namespace bda

View File

@ -26,7 +26,6 @@
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
@ -46,7 +45,6 @@ template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
cl_int err = CL_SUCCESS;
std::ostringstream out;
try {
std::vector<cl::Platform> platforms;
@ -332,20 +330,23 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
//applyblockedscaleadd(-1.0, mat, x, r);
// set initial values
cl::Event event;
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N);
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
events.resize(5);
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
rho = 1.0;
alpha = 1.0;
omega = 1.0;
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
}
norm = norm_w(d_r, d_tmp);
norm_0 = norm;
@ -475,18 +476,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
out.clear();
try {
cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
program = cl::Program(*context, source);
program.build(devices);
prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
@ -518,27 +507,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
get_opencl_kernels();
// actually creating the kernels
dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
stdwell_apply_no_reorder_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());
} catch (const cl::Error& error) {
std::ostringstream oss;
@ -554,6 +525,59 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
initialized = true;
} // end initialize()
void add_kernel_string(cl::Program::Sources &sources, std::string &source) {
sources.emplace_back(std::make_pair(source.c_str(), source.size()));
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::get_opencl_kernels() {
cl::Program::Sources sources;
std::string axpy_s = get_axpy_string();
add_kernel_string(sources, axpy_s);
std::string dot_1_s = get_dot_1_string();
add_kernel_string(sources, dot_1_s);
std::string norm_s = get_norm_string();
add_kernel_string(sources, norm_s);
std::string custom_s = get_custom_string();
add_kernel_string(sources, custom_s);
std::string spmv_blocked_s = get_spmv_blocked_string();
add_kernel_string(sources, spmv_blocked_s);
#if CHOW_PATEL
bool ilu_operate_on_full_matrix = false;
#else
bool ilu_operate_on_full_matrix = true;
#endif
std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply1_s);
std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply2_s);
std::string stdwell_apply_s = get_stdwell_apply_string(true);
add_kernel_string(sources, stdwell_apply_s);
std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
add_kernel_string(sources, stdwell_apply_no_reorder_s);
std::string ilu_decomp_s = get_ilu_decomp_string();
add_kernel_string(sources, ilu_decomp_s);
cl::Program program = cl::Program(*context, sources);
program.build(devices);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
// actually creating the kernels
dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
} // end get_opencl_kernels()
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
if (opencl_ilu_reorder != ILUReorder::NONE) {
@ -569,7 +593,7 @@ void openclSolverBackend<block_size>::finalize() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() {
Timer t;
cl::Event event;
events.resize(5);
#if COPY_ROW_BY_ROW
int sum = 0;
@ -578,19 +602,25 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
#else
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
#endif
queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
if (opencl_ilu_reorder != ILUReorder::NONE) {
queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
events.resize(6);
queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder, nullptr, &events[5]);
}
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error");
}
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
if (verbosity > 2) {
std::ostringstream out;
@ -603,7 +633,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system_on_gpu() {
Timer t;
cl::Event event;
events.resize(3);
#if COPY_ROW_BY_ROW
int sum = 0;
@ -612,14 +642,19 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
#else
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
#endif
queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error");
}
if (verbosity > 2) {
std::ostringstream out;

View File

@ -21,6 +21,7 @@
#define OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
@ -64,22 +65,16 @@ private:
// shared pointers are also passed to other objects
std::vector<cl::Device> devices;
cl::Program program;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_no_reorder_k;
std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
std::shared_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
std::shared_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
std::shared_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
std::shared_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
std::shared_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@ -87,6 +82,8 @@ private:
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
ILUReorder opencl_ilu_reorder; // reordering strategy
std::vector<cl::Event> events;
cl_int err;
/// Divide A by B, and round up: return (int)ceil(A/B)
/// \param[in] A dividend
@ -147,6 +144,9 @@ private:
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
/// Generate and compile opencl kernels
void get_opencl_kernels();
/// Clean memory
void finalize();