Merge pull request #3619 from Tongdongq/opencl-refactor

Opencl refactor
Markus Blatt 2021-11-08 16:45:57 +01:00 committed by GitHub
commit ad44564f27
33 changed files with 1124 additions and 829 deletions

View File

@ -26,6 +26,9 @@ option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF)
option(USE_CHOW_PATEL_ILU "Use the iterative ILU by Chow and Patel?" OFF)
option(USE_CHOW_PATEL_ILU_GPU "Run iterative ILU decomposition on GPU? Requires USE_CHOW_PATEL_ILU" OFF)
option(USE_CHOW_PATEL_ILU_GPU_PARALLEL "Try to use more parallelism on the GPU during the iterative ILU decomposition? Requires USE_CHOW_PATEL_ILU_GPU" OFF)
if(SIBLING_SEARCH AND NOT opm-common_DIR)
# guess the sibling dir
@ -211,6 +214,24 @@ if(OpenCL_FOUND)
set(OpenCL_FOUND OFF)
set(OPENCL_FOUND OFF)
endif()
if(USE_CHOW_PATEL_ILU)
add_compile_options(-DCHOW_PATEL=1)
if(USE_CHOW_PATEL_ILU_GPU)
add_compile_options(-DCHOW_PATEL_GPU=1)
if(USE_CHOW_PATEL_ILU_GPU_PARALLEL)
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=1)
else()
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=0)
endif()
else()
add_compile_options(-DCHOW_PATEL_GPU=0)
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=0)
endif()
endif()
else()
if(USE_CHOW_PATEL_ILU)
message(FATAL_ERROR "CHOW_PATEL_ILU only works with openclSolver, but OpenCL was not found")
endif()
endif()
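These options gate the preconditioner code paths at compile time. A hedged sketch of the consuming code, mirroring the call site in BILU0::create_preconditioner() later in this diff (arguments elided):
#if CHOW_PATEL
    // iterative FGPILU; the sweeps run on the GPU when CHOW_PATEL_GPU=1
    chowPatelIlu.decomposition(/* ... */);
#else
    // exact ILU0 decomposition on the GPU
    OpenclKernels::ILU_decomp(/* ... */);
#endif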
find_package(amgcl)
@ -242,6 +263,7 @@ if(OpenCL_FOUND)
endif()
endif()
# read the list of components from this file (in the project directory);
# it should set various lists with the names of the files to include
include (CMakeLists_files.cmake)

View File

@ -18,10 +18,10 @@
*/
#include <config.h>
#include <algorithm>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/MatrixBlock.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
@ -30,7 +30,9 @@
#include <opm/simulators/linalg/bda/Reorder.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -39,7 +41,11 @@ using Dune::Timer;
template <unsigned int block_size>
BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
{}
{
#if CHOW_PATEL
chowPatelIlu.setVerbosity(verbosity);
#endif
}
template <unsigned int block_size>
BILU0<block_size>::~BILU0()
@ -158,350 +164,6 @@ BILU0<block_size>::~BILU0()
} // end init()
// implements the Fine-Grained Parallel ILU algorithm (FGPILU) of Chow and Patel, 2015
template <unsigned int block_size>
void BILU0<block_size>::chow_patel_decomposition()
{
#if CHOW_PATEL
const unsigned int bs = block_size;
int num_sweeps = 6;
// split matrix into L and U
// also convert U into BSC format (Ut)
// Ut stores diagonal for now
// original matrix LUmat is assumed to be symmetric
#ifndef NDEBUG
// verify that matrix is symmetric
for (int i = 0; i < Nb; ++i){
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block (i, j) in this row, check if (j, i) also exists
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
int jRowStart = LUmat->rowPointers[j];
int jRowEnd = LUmat->rowPointers[j + 1];
bool blockFound = false;
// check all blocks on row j
// binary search is possible
for (int ji = jRowStart; ji < jRowEnd; ji++) {
int row = LUmat->colIndices[ji];
if (i == row) {
blockFound = true;
break;
}
}
if (false == blockFound) {
OPM_THROW(std::logic_error, "Error sparsity pattern must be symmetric when using chow_patel_decomposition()");
}
}
}
#endif
Timer t_total, t_preprocessing;
// Ut is actually BSC format
std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Nb, (nnzbs + Nb) / 2);
Lmat->rowPointers[0] = 0;
for (int i = 0; i < Nb+1; i++) {
Ut->rowPointers[i] = 0;
}
Opm::Detail::Inverter<bs> inverter;
// store inverted diagonal
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i == j) {
inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
}
}
}
// initialize initial guess for L: L_A * D
// L_A is strictly lower triangular part of A
// D is inv(diag(A))
int num_blocks_L = 0;
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i <= j) {
Ut->rowPointers[j+1]++; // actually colPointers, now simply indicates how many blocks this col holds
} else {
Lmat->colIndices[num_blocks_L] = j;
// multiply block of L with corresponding diag block
blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
num_blocks_L++;
}
}
// TODO: copy all blocks for L at once, instead of copying each block individually
Lmat->rowPointers[i+1] = num_blocks_L;
}
// prefix sum to sum rowsizes into colpointers
std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers);
// initialize initial guess for U
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i <= j){
int idx = Ut->rowPointers[j]++; // rowPointers[i] is (mis)used as the write offset of the current row i
Ut->colIndices[idx] = i; // actually rowIndices
memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
}
}
}
// rotate
// the Ut->rowPointers were increased in the last loop
// now Ut->rowPointers[i+1] holds the value that Ut->rowPointers[i] should have for a CRS matrix; shift back to restore the expected values
for (int i = Nb; i > 0; --i) {
Ut->rowPointers[i] = Ut->rowPointers[i-1];
}
Ut->rowPointers[0] = 0;
// Utmp is needed for both CPU and GPU decomposition, because U is transposed and reversed after decomposition
// U will be reversed because it is used for backward substitution, which consumes the last row first
// Ltmp is only needed for the CPU decomposition; the GPU path creates its own buffer for Ltmp
double *Utmp = new double[Ut->nnzbs * block_size * block_size];
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 ChowPatel preprocessing: " << t_preprocessing.stop() << " s";
OpmLog::info(out.str());
}
// actual ILU decomposition
Timer t_decomposition;
#if CHOW_PATEL_GPU
chowPatelIlu.decomposition(queue, context,
Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs,
LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs,
Nb, num_sweeps, verbosity);
#else
double *Ltmp = new double[Lmat->nnzbs * block_size * block_size];
for (int sweep = 0; sweep < num_sweeps; ++sweep) {
// algorithm
// for every block in A (LUmat):
// if i > j:
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
// else:
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
// for every row
for (int row = 0; row < Nb; row++) {
// update U
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
int jColStart = Ut->rowPointers[row];
int jColEnd = Ut->rowPointers[row + 1];
int colU = row; // rename for clarity, next row in Ut means next col in U
// for every block in this row
for (int ij = jColStart; ij < jColEnd; ij++) {
int rowU1 = Ut->colIndices[ij]; // actually rowIndices for U
// refine Uij element (or diagonal)
int i1 = LUmat->rowPointers[rowU1];
int i2 = LUmat->rowPointers[rowU1+1];
// search on row rowU1, find blockIndex in LUmat of block with same col (colU) as Uij
// LUmat->nnzValues[kk] is block Aij
auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, colU);
assert(candidate != LUmat->colIndices + i2);
auto kk = candidate - LUmat->colIndices;
double aij[bs*bs];
// copy block to Aij so operations can be done on it without affecting LUmat
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
int jk = Lmat->rowPointers[rowU1]; // points to row rowU1 in L
// if row rowU1 is empty, skip row
if (jk < Lmat->rowPointers[rowU1+1]) {
int colL = Lmat->colIndices[jk];
// only check until block U(i,j) is reached
for (int k = jColStart; k < ij; ++k) {
int rowU2 = Ut->colIndices[k];
while (colL < rowU2) {
++jk; // check next block on row rowU1 of L
colL = Lmat->colIndices[jk];
}
if (colL == rowU2) {
// Aij -= (Lik * Ukj)
blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
}
}
}
// Uij_new = Aij - sum
memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs);
}
// update L
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
int iRowStart = Lmat->rowPointers[row];
int iRowEnd = Lmat->rowPointers[row + 1];
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = Lmat->colIndices[ij];
// refine Lij element
// search on row 'row', find blockIndex in LUmat of block with same col (j) as Lij
// LUmat->nnzValues[kk] is block Aij
int i1 = LUmat->rowPointers[row];
int i2 = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, j);
assert(candidate != LUmat->colIndices + i2);
auto kk = candidate - LUmat->colIndices;
double aij[bs*bs];
// copy block to Aij so operations can be done on it without affecting LUmat
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
int jk = Ut->rowPointers[j]; // actually colPointers, jk points to col j in U
int rowU = Ut->colIndices[jk]; // actually rowIndices, rowU is the row of block jk
// only check until block L(i,j) is reached
for (int k = iRowStart; k < ij; ++k) {
int colL = Lmat->colIndices[k];
while (rowU < colL) {
++jk; // check next block on col j of U
rowU = Ut->colIndices[jk];
}
if (rowU == colL) {
// Aij -= (Lik * Ukj)
blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
}
}
// calculate (Aij - sum) / Ujj
double ujj[bs*bs];
inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
// Lij_new = (Aij - sum) / Ujj
blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
}
}
// 1st sweep writes to Ltmp
// 2nd sweep writes to Lmat->nnzValues
std::swap(Lmat->nnzValues, Ltmp);
std::swap(Ut->nnzValues, Utmp);
} // end sweep
// if number of sweeps is even, swap again so data is in Lmat->nnzValues
if (num_sweeps % 2 == 0) {
std::swap(Lmat->nnzValues, Ltmp);
std::swap(Ut->nnzValues, Utmp);
}
delete[] Ltmp;
#endif
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str());
}
Timer t_postprocessing;
// convert Ut to BSR
// diagonal stored separately
std::vector<int> ptr(Nb+1, 0);
std::vector<int> cols(Ut->rowPointers[Nb]);
// count blocks per row for U (BSR)
// store diagonal in invDiagVals
for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k];
if (j == i) {
inverter(Ut->nnzValues + k * bs * bs, invDiagVals + i * bs * bs);
} else {
++ptr[j+1];
}
}
}
// prefix sum
std::partial_sum(ptr.begin(), ptr.end(), ptr.begin());
// actually copy nonzero values for U
for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k];
if (j != i) {
int head = ptr[j]++;
cols[head] = i;
memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
}
}
}
// the ptr[] entries were increased in the last loop; rotate to restore the row pointers
std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
ptr.front() = 0;
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel postprocessing: " << t_postprocessing.stop() << " s";
OpmLog::info(out.str());
}
Timer t_copyToGpu;
events.resize(3);
queue->enqueueWriteBuffer(s.Lvals, CL_FALSE, 0, Lmat->nnzbs * bs * bs * sizeof(double), Lmat->nnzValues, nullptr, &events[0]);
queue->enqueueWriteBuffer(s.Uvals, CL_FALSE, 0, Umat->nnzbs * bs * bs * sizeof(double), Utmp, nullptr, &events[1]);
queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, LUmat->Nb * bs * bs * sizeof(double), invDiagVals, nullptr, &events[2]);
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(8);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[3]);
queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
queue->enqueueWriteBuffer(s.Urows, CL_FALSE, 0, (Umat->Nb + 1) * sizeof(int), ptr.data(), nullptr, &events[7]);
});
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer returns an error code instead of throwing an exception, so check it explicitly
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel copy to GPU: " << t_copyToGpu.stop() << " s\n";
out << "BILU0 ChowPatel total: " << t_total.stop() << " s";
OpmLog::info(out.str());
}
delete[] Utmp;
#endif // CHOW_PATEL
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
@ -533,7 +195,12 @@ BILU0<block_size>::~BILU0()
}
#if CHOW_PATEL
chow_patel_decomposition();
chowPatelIlu.decomposition(queue, context,
LUmat.get(), Lmat.get(), Umat.get(),
invDiagVals, diagIndex,
s.diagIndex, s.invDiagVals,
s.Lvals, s.Lcols, s.Lrows,
s.Uvals, s.Ucols, s.Urows);
#else
Timer t_copyToGpu;
@ -576,16 +243,10 @@ BILU0<block_size>::~BILU0()
for (int color = 0; color < numColors; ++color) {
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color+1];
const unsigned int work_group_size2 = 128;
const unsigned int num_work_groups2 = 1024;
const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
const unsigned int num_hwarps_per_group = work_group_size2 / 16;
const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double); // each block needs a pivot
if (verbosity >= 4) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
event = (*ilu_decomp)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
event.wait();
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
}
if (verbosity >= 3) {
@ -601,7 +262,7 @@ BILU0<block_size>::~BILU0()
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested
template <unsigned int block_size>
void BILU0<block_size>::apply(cl::Buffer& y, cl::Buffer& x)
void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
{
const double relaxation = 0.9;
cl::Event event;
@ -609,28 +270,24 @@ BILU0<block_size>::~BILU0()
for(int color = 0; color < numColors; ++color){
#if CHOW_PATEL
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#else
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#endif
// event.wait();
}
for(int color = numColors-1; color >= 0; --color){
#if CHOW_PATEL
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#else
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#endif
// event.wait();
}
// apply relaxation
event = (*scale)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), x, relaxation, N);
event.wait();
OpenclKernels::scale(x, relaxation, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream out;
out << "BILU0 apply: " << t_apply.stop() << " s";
OpmLog::info(out.str());
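The core pattern of this refactor: kernel launch configuration moves out of the callers and into static OpenclKernels wrappers. A hedged sketch of what such a wrapper might look like (hypothetical body, since openclKernels.cpp is not shown in this excerpt; scale_k and queue are assumed to be static members initialized elsewhere):
void OpenclKernels::scale(cl::Buffer& in, const double a, const int N)
{
    const unsigned int work_group_size = 32; // hypothetical default
    const unsigned int num_work_groups = (N + work_group_size - 1) / work_group_size;
    // the same cl::KernelFunctor call the callers previously wrote by hand
    cl::Event event = (*scale_k)(cl::EnqueueArgs(*queue,
                                     cl::NDRange(num_work_groups * work_group_size),
                                     cl::NDRange(work_group_size)),
                                 in, a, N);
    event.wait();
}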
@ -646,42 +303,17 @@ template <unsigned int block_size>
void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_) {
this->queue = queue_;
}
template <unsigned int block_size>
void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_) {
this->work_group_size = work_group_size_;
this->total_work_items = total_work_items_;
this->lmem_per_work_group = lmem_per_work_group_;
}
template <unsigned int block_size>
void BILU0<block_size>::setKernels(
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale_,
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_
){
this->ILU_apply1 = ILU_apply1_;
this->ILU_apply2 = ILU_apply2_;
this->scale = scale_;
this->ilu_decomp = ilu_decomp_;
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BILU0<n>::BILU0(ILUReorder, int); \
template BILU0<n>::~BILU0(); \
template bool BILU0<n>::init(BlockedMatrix<n>*); \
template void BILU0<n>::chow_patel_decomposition(); \
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y); \
template void BILU0<n>::setOpenCLContext(cl::Context*); \
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*); \
template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int); \
template void BILU0<n>::setKernels( \
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *, \
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> * \
);
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BILU0<n>::BILU0(ILUReorder, int); \
template BILU0<n>::~BILU0(); \
template bool BILU0<n>::init(BlockedMatrix<n>*); \
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
template void BILU0<n>::apply(const cl::Buffer&, cl::Buffer&); \
template void BILU0<n>::setOpenCLContext(cl::Context*); \
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
@ -692,4 +324,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // end namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -29,19 +29,10 @@
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
// the exact decomposition uses a full matrix LU which is the superposition of L and U
// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
#define CHOW_PATEL 0
#define CHOW_PATEL_GPU 1
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class implements a Blocked ILU0 preconditioner
@ -83,25 +74,15 @@ namespace bda
#endif
} GPU_storage;
ilu_apply1_kernel_type *ILU_apply1;
ilu_apply2_kernel_type *ILU_apply2;
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale;
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg> *ilu_decomp;
GPU_storage s;
cl::Context *context;
cl::CommandQueue *queue;
std::vector<cl::Event> events;
cl_int err;
int work_group_size = 0;
int total_work_items = 0;
int lmem_per_work_group = 0;
ChowPatelIlu chowPatelIlu;
void chow_patel_decomposition();
#if CHOW_PATEL
ChowPatelIlu<block_size> chowPatelIlu;
#endif
public:
@ -116,17 +97,10 @@ namespace bda
bool create_preconditioner(BlockedMatrix<block_size> *mat);
// apply preconditioner, x = prec(y)
void apply(cl::Buffer& y, cl::Buffer& x);
void apply(const cl::Buffer& y, cl::Buffer& x);
void setOpenCLContext(cl::Context *context);
void setOpenCLQueue(cl::CommandQueue *queue);
void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
void setKernels(
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale,
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp
);
int* getToOrder()
{
@ -145,7 +119,8 @@ namespace bda
};
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -47,10 +47,10 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
namespace Opm
{
using bda::BdaResult;
using bda::BdaSolver;
using bda::SolverStatus;
using bda::ILUReorder;
using Opm::Accelerator::BdaResult;
using Opm::Accelerator::BdaSolver;
using Opm::Accelerator::SolverStatus;
using Opm::Accelerator::ILUReorder;
template <class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_,
@ -65,7 +65,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
if (accelerator_mode.compare("cusparse") == 0) {
#if HAVE_CUDA
use_gpu = true;
backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
#else
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
#endif
@ -74,17 +74,17 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
use_gpu = true;
ILUReorder ilu_reorder;
if (opencl_ilu_reorder == "") {
ilu_reorder = bda::ILUReorder::GRAPH_COLORING; // default when not selected by user
ilu_reorder = Opm::Accelerator::ILUReorder::GRAPH_COLORING; // default when not selected by user
} else if (opencl_ilu_reorder == "level_scheduling") {
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
ilu_reorder = Opm::Accelerator::ILUReorder::LEVEL_SCHEDULING;
} else if (opencl_ilu_reorder == "graph_coloring") {
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
ilu_reorder = Opm::Accelerator::ILUReorder::GRAPH_COLORING;
} else if (opencl_ilu_reorder == "none") {
ilu_reorder = bda::ILUReorder::NONE;
ilu_reorder = Opm::Accelerator::ILUReorder::NONE;
} else {
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
}
backend.reset(new bda::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder));
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder));
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
@ -93,22 +93,22 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
use_fpga = true;
ILUReorder ilu_reorder;
if (opencl_ilu_reorder == "") {
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING; // default when not selected by user
ilu_reorder = Opm::Accelerator::ILUReorder::LEVEL_SCHEDULING; // default when not selected by user
} else if (opencl_ilu_reorder == "level_scheduling") {
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
ilu_reorder = Opm::Accelerator::ILUReorder::LEVEL_SCHEDULING;
} else if (opencl_ilu_reorder == "graph_coloring") {
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
ilu_reorder = Opm::Accelerator::ILUReorder::GRAPH_COLORING;
} else {
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
}
backend.reset(new bda::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
backend.reset(new Opm::Accelerator::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
#else
OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake");
#endif
} else if (accelerator_mode.compare("amgcl") == 0) {
#if HAVE_AMGCL
use_gpu = true; // should be replaced by a 'use_bridge' boolean
backend.reset(new bda::amgclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
backend.reset(new Opm::Accelerator::amgclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
#else
OPM_THROW(std::logic_error, "Error amgclSolver was chosen, but amgcl was not found by CMake");
#endif
@ -269,7 +269,7 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs) {
if(accelerator_mode.compare("opencl") == 0){
#if HAVE_OPENCL
const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(backend.get());
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");

View File

@ -38,7 +38,7 @@ namespace Opm
{
typedef Dune::InverseOperatorResult InverseOperatorResult;
using bda::ILUReorder;
using Opm::Accelerator::ILUReorder;
/// BdaBridge acts as an interface between opm-simulators and the BdaSolvers
template <class BridgeMatrix, class BridgeVector, int block_size>
@ -49,7 +49,7 @@ private:
bool use_gpu = false;
bool use_fpga = false;
std::string accelerator_mode;
std::unique_ptr<bda::BdaSolver<block_size> > backend;
std::unique_ptr<Opm::Accelerator::BdaSolver<block_size> > backend;
public:
/// Construct a BdaBridge

View File

@ -20,7 +20,9 @@
#ifndef BDARESULT_HEADER_INCLUDED
#define BDARESULT_HEADER_INCLUDED
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class is based on InverseOperatorResult struct from dune/istl/solver.hh
@ -39,6 +41,7 @@ public:
}; // end class BdaResult
}
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -24,7 +24,9 @@
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::WellContributions;
@ -91,6 +93,7 @@ namespace bda
}; // end class BdaSolver
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -28,7 +28,9 @@
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -504,5 +506,5 @@ INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FPGA_FUNCTIONS
#endif // HAVE_FPGA
} // end namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -26,7 +26,9 @@
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
@ -164,6 +166,7 @@ void blockVectMult(double *mat, double *vect, double scale, double *resVect, boo
void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag);
#endif
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -18,18 +18,25 @@
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/MatrixBlock.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
namespace bda
#if CHOW_PATEL
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
using Opm::OpmLog;
using Dune::Timer;
// if PARALLEL is 0:
// if CHOW_PATEL_GPU_PARALLEL is 0:
// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
// Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously,
// except when the number of blocks in that row exceeds the number of workitems per workgroup.
@ -41,15 +48,13 @@ namespace bda
// If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially.
// Notes:
// PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
// PARALLEL 1 should be run with at least 32 workitems per workgroup.
// CHOW_PATEL_GPU_PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
// CHOW_PATEL_GPU_PARALLEL 1 should be run with at least 32 workitems per workgroup.
// The recommended number of workgroups for both options is Nb, which gives every row their own workgroup.
// PARALLEL 0 is generally faster, despite not having parallelization.
// CHOW_PATEL_GPU_PARALLEL 0 is generally faster, despite not having parallelization.
// only 3x3 blocks are supported
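// In terms of launch parameters, the notes above amount to:
//   num_work_groups  = Nb   (one workgroup per block row)
//   work_group_size  = 32 with CHOW_PATEL_GPU_PARALLEL, 16 otherwise
//   total_work_items = num_work_groups * work_group_size
// (see gpu_decomposition() further down)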
#define PARALLEL 0
#if PARALLEL
#if CHOW_PATEL_GPU_PARALLEL
inline const char* chow_patel_ilu_sweep_s = R"(
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
@ -265,7 +270,7 @@ __kernel void chow_patel_ilu_sweep(
}
)";
#else
#else // CHOW_PATEL_GPU_PARALLEL
inline const char* chow_patel_ilu_sweep_s = R"(
@ -468,22 +473,378 @@ __kernel void chow_patel_ilu_sweep(
}
)";
#endif // CHOW_PATEL_GPU_PARALLEL
// implements the Fine-Grained Parallel ILU algorithm (FGPILU) of Chow and Patel, 2015
template <unsigned int block_size>
void ChowPatelIlu<block_size>::decomposition(
cl::CommandQueue *queue, [[maybe_unused]] cl::Context *context,
BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
double *invDiagVals, std::vector<int>& diagIndex,
cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
cl::Buffer& d_Uvals, cl::Buffer& d_Ucols, cl::Buffer& d_Urows)
{
if (block_size != 3) {
OPM_THROW(std::logic_error, "ChowPatelIlu::decomposition only supports block_size = 3");
}
const unsigned int bs = block_size;
const int Nb = LUmat->Nb;
const int nnzbs = LUmat->nnzbs;
int num_sweeps = 6;
// split matrix into L and U
// also convert U into BSC format (Ut)
// Ut stores diagonal for now
// original matrix LUmat is assumed to be symmetric
#ifndef NDEBUG
// verify that matrix is symmetric
for (int i = 0; i < Nb; ++i){
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block (i, j) in this row, check if (j, i) also exists
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
int jRowStart = LUmat->rowPointers[j];
int jRowEnd = LUmat->rowPointers[j + 1];
bool blockFound = false;
// check all blocks on row j
// binary search is possible
for (int ji = jRowStart; ji < jRowEnd; ji++) {
int row = LUmat->colIndices[ji];
if (i == row) {
blockFound = true;
break;
}
}
if (false == blockFound) {
OPM_THROW(std::logic_error, "Error sparsity pattern must be symmetric when using chow_patel_decomposition()");
}
}
}
#endif
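// Note (hedged): assuming the colIndices within each row are sorted, the
// linear scan in the symmetry check above could use std::binary_search:
//   bool blockFound = std::binary_search(LUmat->colIndices + jRowStart,
//                                        LUmat->colIndices + jRowEnd, i);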
Timer t_total, t_preprocessing;
// Ut is actually BSC format
std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Nb, (nnzbs + Nb) / 2);
Lmat->rowPointers[0] = 0;
for (int i = 0; i < Nb+1; i++) {
Ut->rowPointers[i] = 0;
}
Opm::Detail::Inverter<bs> inverter;
// store inverted diagonal
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i == j) {
inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
}
}
}
// initialize initial guess for L: L_A * D
// L_A is strictly lower triangular part of A
// D is inv(diag(A))
int num_blocks_L = 0;
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i <= j) {
Ut->rowPointers[j+1]++; // actually colPointers, now simply indicates how many blocks this col holds
} else {
Lmat->colIndices[num_blocks_L] = j;
// multiply block of L with corresponding diag block
blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
num_blocks_L++;
}
}
// TODO: copy all blocks for L at once, instead of copying each block individually
Lmat->rowPointers[i+1] = num_blocks_L;
}
// prefix sum to sum rowsizes into colpointers
std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers);
// initialize initial guess for U
for (int i = 0; i < Nb; i++) {
int iRowStart = LUmat->rowPointers[i];
int iRowEnd = LUmat->rowPointers[i + 1];
// for every block in this row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUmat->colIndices[ij];
if (i <= j){
int idx = Ut->rowPointers[j]++; // rowPointers[i] is (mis)used as the write offset of the current row i
Ut->colIndices[idx] = i; // actually rowIndices
memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
}
}
}
// rotate
// the Ut->rowPointers were increased in the last loop
// now Ut->rowPointers[i+1] holds the value that Ut->rowPointers[i] should have for a CRS matrix; shift back to restore the expected values
for (int i = Nb; i > 0; --i) {
Ut->rowPointers[i] = Ut->rowPointers[i-1];
}
Ut->rowPointers[0] = 0;
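// Toy illustration (hedged) of the count / prefix-sum / scatter / shift idiom
// used above, for Nb = 3 and upper-triangular entries with cols = {0, 2, 1, 2}:
//   count:   ptr = {0, 1, 1, 2}   // ptr[c+1]++ per entry
//   prefix:  ptr = {0, 1, 2, 4}   // std::partial_sum
//   scatter: ptr[c]++ is the write offset; afterwards ptr = {1, 2, 4, 4}
//   shift:   ptr = {0, 1, 2, 4}   // each ptr[i] takes the old ptr[i-1]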
// Utmp is needed for both CPU and GPU decomposition, because U is transposed and reversed after decomposition
// U will be reversed because it is used for backward substitution, which consumes the last row first
// Ltmp is only needed for the CPU decomposition; the GPU path creates its own buffer for Ltmp
double *Utmp = new double[Ut->nnzbs * block_size * block_size];
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 ChowPatel preprocessing: " << t_preprocessing.stop() << " s";
OpmLog::info(out.str());
}
// actual ILU decomposition
Timer t_decomposition;
#if CHOW_PATEL_GPU
gpu_decomposition(queue, context,
Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs,
LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs,
Nb, num_sweeps);
#else
double *Ltmp = new double[Lmat->nnzbs * block_size * block_size];
for (int sweep = 0; sweep < num_sweeps; ++sweep) {
// algorithm
// for every block in A (LUmat):
// if i > j:
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
// else:
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
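// each sweep reads L and U as computed by the previous sweep (Jacobi style),
// so the updates within one sweep are independent of each other; a fixed
// point of these equations is the exact ILU0 factorization on this sparsity
// pattern (Chow & Patel 2015)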
// for every row
for (int row = 0; row < Nb; row++) {
// update U
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
int jColStart = Ut->rowPointers[row];
int jColEnd = Ut->rowPointers[row + 1];
int colU = row; // rename for clarity, next row in Ut means next col in U
// for every block in this row
for (int ij = jColStart; ij < jColEnd; ij++) {
int rowU1 = Ut->colIndices[ij]; // actually rowIndices for U
// refine Uij element (or diagonal)
int i1 = LUmat->rowPointers[rowU1];
int i2 = LUmat->rowPointers[rowU1+1];
// search on row rowU1, find blockIndex in LUmat of block with same col (colU) as Uij
// LUmat->nnzValues[kk] is block Aij
auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, colU);
assert(candidate != LUmat->colIndices + i2);
auto kk = candidate - LUmat->colIndices;
double aij[bs*bs];
// copy block to Aij so operations can be done on it without affecting LUmat
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
int jk = Lmat->rowPointers[rowU1]; // points to row rowU1 in L
// if row rowU1 is empty, skip row
if (jk < Lmat->rowPointers[rowU1+1]) {
int colL = Lmat->colIndices[jk];
// only check until block U(i,j) is reached
for (int k = jColStart; k < ij; ++k) {
int rowU2 = Ut->colIndices[k];
while (colL < rowU2) {
++jk; // check next block on row rowU1 of L
colL = Lmat->colIndices[jk];
}
if (colL == rowU2) {
// Aij -= (Lik * Ukj)
blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
}
}
}
// Uij_new = Aij - sum
memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs);
}
// update L
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
int iRowStart = Lmat->rowPointers[row];
int iRowEnd = Lmat->rowPointers[row + 1];
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = Lmat->colIndices[ij];
// refine Lij element
// search on row 'row', find blockIndex in LUmat of block with same col (j) as Lij
// LUmat->nnzValues[kk] is block Aij
int i1 = LUmat->rowPointers[row];
int i2 = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, j);
assert(candidate != LUmat->colIndices + i2);
auto kk = candidate - LUmat->colIndices;
double aij[bs*bs];
// copy block to Aij so operations can be done on it without affecting LUmat
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
int jk = Ut->rowPointers[j]; // actually colPointers, jk points to col j in U
int rowU = Ut->colIndices[jk]; // actually rowIndices, rowU is the row of block jk
// only check until block L(i,j) is reached
for (int k = iRowStart; k < ij; ++k) {
int colL = Lmat->colIndices[k];
while (rowU < colL) {
++jk; // check next block on col j of U
rowU = Ut->colIndices[jk];
}
if (rowU == colL) {
// Aij -= (Lik * Ukj)
blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
}
}
// calculate (Aij - sum) / Ujj
double ujj[bs*bs];
inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
// Lij_new = (Aij - sum) / Ujj
blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
}
}
// 1st sweep writes to Ltmp
// 2nd sweep writes to Lmat->nnzValues
std::swap(Lmat->nnzValues, Ltmp);
std::swap(Ut->nnzValues, Utmp);
} // end sweep
// if number of sweeps is even, swap again so data is in Lmat->nnzValues
if (num_sweeps % 2 == 0) {
std::swap(Lmat->nnzValues, Ltmp);
std::swap(Ut->nnzValues, Utmp);
}
delete[] Ltmp;
#endif
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str());
}
Timer t_postprocessing;
// convert Ut to BSR
// diagonal stored separately
std::vector<int> ptr(Nb+1, 0);
std::vector<int> cols(Ut->rowPointers[Nb]);
// count blocks per row for U (BSR)
// store diagonal in invDiagVals
for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k];
if (j == i) {
inverter(Ut->nnzValues + k * bs * bs, invDiagVals + i * bs * bs);
} else {
++ptr[j+1];
}
}
}
// prefix sum
std::partial_sum(ptr.begin(), ptr.end(), ptr.begin());
// actually copy nonzero values for U
for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k];
if (j != i) {
int head = ptr[j]++;
cols[head] = i;
memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
}
}
}
// the ptr[] entries were increased in the last loop; rotate to restore the row pointers
std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
ptr.front() = 0;
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel postprocessing: " << t_postprocessing.stop() << " s";
OpmLog::info(out.str());
}
Timer t_copyToGpu;
events.resize(3);
err = queue->enqueueWriteBuffer(d_Lvals, CL_FALSE, 0, Lmat->nnzbs * bs * bs * sizeof(double), Lmat->nnzValues, nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(d_Uvals, CL_FALSE, 0, Umat->nnzbs * bs * bs * sizeof(double), Utmp, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_invDiagVals, CL_FALSE, 0, LUmat->Nb * bs * bs * sizeof(double), invDiagVals, nullptr, &events[2]);
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(8);
err |= queue->enqueueWriteBuffer(d_diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[3]);
err |= queue->enqueueWriteBuffer(d_Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
err |= queue->enqueueWriteBuffer(d_Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
err |= queue->enqueueWriteBuffer(d_Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
err |= queue->enqueueWriteBuffer(d_Urows, CL_FALSE, 0, (Umat->Nb + 1) * sizeof(int), ptr.data(), nullptr, &events[7]);
});
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer returns an error code instead of throwing an exception, so check it explicitly
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 ChowPatel copy to GPU: " << t_copyToGpu.stop() << " s\n";
out << "BILU0 ChowPatel total: " << t_total.stop() << " s";
OpmLog::info(out.str());
}
delete[] Utmp;
}
void ChowPatelIlu::decomposition(
template <unsigned int block_size>
void ChowPatelIlu<block_size>::gpu_decomposition(
cl::CommandQueue *queue, cl::Context *context,
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
int Nb, int num_sweeps, int verbosity)
int Nb, int num_sweeps)
{
const int block_size = 3;
if (block_size != 3) {
OPM_THROW(std::logic_error, "ChowPatelIlu::gpu_decomposition only supports block_size = 3");
}
try {
// just put everything in the capture list
@ -535,7 +896,7 @@ void ChowPatelIlu::decomposition(
OpmLog::info(out.str());
}
std::ostringstream out;
out << "ChowPatelIlu PARALLEL: " << PARALLEL;
out << "ChowPatelIlu PARALLEL: " << CHOW_PATEL_GPU_PARALLEL;
OpmLog::info(out.str());
});
@ -569,7 +930,7 @@ void ChowPatelIlu::decomposition(
auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp;
auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals;
int num_work_groups = Nb;
#if PARALLEL
#if CHOW_PATEL_GPU_PARALLEL
int work_group_size = 32;
#else
int work_group_size = 16;
@ -625,5 +986,26 @@ void ChowPatelIlu::decomposition(
}
} // end namespace bda
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template void ChowPatelIlu<n>::decomposition( \
cl::CommandQueue *queue, cl::Context *context, \
BlockedMatrix<n> *LUmat, BlockedMatrix<n> *Lmat, BlockedMatrix<n> *Umat, \
double *invDiagVals, std::vector<int>& diagIndex, \
cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals, \
cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows, \
cl::Buffer& d_Uvals, cl::Buffer& d_Ucols, cl::Buffer& d_Urows);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
#endif // CHOW_PATEL

View File

@ -24,64 +24,106 @@
#include <mutex>
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
// Variables CHOW_PATEL, CHOW_PATEL_GPU and CHOW_PATEL_GPU_PARALLEL are set by CMake
// Pass -DUSE_CHOW_PATEL_ILU=1 to cmake to define CHOW_PATEL and use the iterative ILU decomposition
// Pass -DUSE_CHOW_PATEL_ILU_GPU=1 to run the ILU decomposition sweeps on the GPU
// Pass -DUSE_CHOW_PATEL_ILU_GPU_PARALLEL=1 to use more parallelisation in the GPU kernel, see ChowPatelIlu.cpp
namespace bda
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
// if CHOW_PATEL_GPU is 1, the decomposition is done by gpu_decomposition() on GPU
// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
// the exact decomposition uses a full matrix LU which is the superposition of L and U
// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
#if CHOW_PATEL
namespace Opm
{
namespace Accelerator
{
// This class implements a blocked version on GPU of the Fine-Grained Parallel ILU (FGPILU) by Chow and Patel 2015:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// only blocksize == 3 is supported
// decomposition() allocates the cl::Buffers on the first call, these are C++ objects that deallocate automatically
class ChowPatelIlu
{
private:
cl::Buffer d_Ut_vals, d_L_vals, d_LU_vals;
cl::Buffer d_Ut_ptrs, d_Ut_idxs;
cl::Buffer d_L_rows, d_L_cols;
cl::Buffer d_LU_rows, d_LU_cols;
cl::Buffer d_Ltmp, d_Utmp;
cl::Event event;
std::vector<cl::Event> events;
cl_int err;
std::once_flag initialize_flag;
// This class implements a blocked version on GPU of the Fine-Grained Parallel ILU (FGPILU) by Chow and Patel 2015:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// only blocksize == 3 is supported
// decomposition() allocates the cl::Buffers on the first call, these are C++ objects that deallocate automatically
template <unsigned int block_size>
class ChowPatelIlu
{
private:
cl::Buffer d_Ut_vals, d_L_vals, d_LU_vals;
cl::Buffer d_Ut_ptrs, d_Ut_idxs;
cl::Buffer d_L_rows, d_L_cols;
cl::Buffer d_LU_rows, d_LU_cols;
cl::Buffer d_Ltmp, d_Utmp;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > chow_patel_ilu_sweep_k;
cl::Event event;
std::vector<cl::Event> events;
cl_int err;
std::once_flag initialize_flag;
std::once_flag pattern_uploaded;
int verbosity = 0;
public:
/// Executes the ChowPatelIlu sweeps
/// also copies data from CPU to GPU and GPU to CPU
/// \param[in] queue OpenCL commandqueue
/// \param[in] context OpenCL context
/// \param[in] Ut_ptrs BSC columnpointers
/// \param[in] Ut_idxs BSC rowindices
/// \param[inout] Ut_vals actual nonzeros for U
/// \param[in] Ut_nnzbs number of blocks in U
/// \param[in] L_rows BSR rowpointers
/// \param[in] L_cols BSR columnindices
/// \param[inout] L_vals actual nonzeroes for L
/// \param[in] L_nnzbs number of blocks in L
/// \param[in] LU_rows BSR rowpointers
/// \param[in] LU_cols BSR columnindices
/// \param[in] LU_vals actual nonzeroes for LU (original matrix)
/// \param[in] LU_nnzbs number of blocks in LU
/// \param[in] Nb number of blockrows
/// \param[in] num_sweeps number of sweeps to be done
/// \param[in] verbosity print verbosity
void decomposition(
cl::CommandQueue *queue, cl::Context *context,
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
int Nb, int num_sweeps, int verbosity);
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > chow_patel_ilu_sweep_k;
};
public:
/// Transposes the U matrix
/// Executes the ChowPatelIlu sweeps for decomposition on CPU
/// Also uploads the decomposition to the GPU (and sparsity pattern if needed)
/// This function calls gpu_decomposition() if CHOW_PATEL_GPU is set
void decomposition(
cl::CommandQueue *queue, cl::Context *context,
BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
double *invDiagVals, std::vector<int>& diagIndex,
cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
cl::Buffer& d_Uvals, cl::Buffer& d_Ucols, cl::Buffer& d_Urows);
} // end namespace bda
/// Executes the ChowPatelIlu sweeps for decomposition on GPU
/// also copies data from CPU to GPU and GPU to CPU
/// \param[in] queue OpenCL commandqueue
/// \param[in] context OpenCL context
/// \param[in] Ut_ptrs BSC columnpointers
/// \param[in] Ut_idxs BSC rowindices
/// \param[inout] Ut_vals actual nonzeros for U
/// \param[in] Ut_nnzbs number of blocks in U
/// \param[in] L_rows BSR rowpointers
/// \param[in] L_cols BSR columnindices
/// \param[inout] L_vals actual nonzeros for L
/// \param[in] L_nnzbs number of blocks in L
/// \param[in] LU_rows BSR rowpointers
/// \param[in] LU_cols BSR columnindices
/// \param[in] LU_vals actual nonzeros for LU (original matrix)
/// \param[in] LU_nnzbs number of blocks in LU
/// \param[in] Nb number of blockrows
/// \param[in] num_sweeps number of sweeps to be done
void gpu_decomposition(
cl::CommandQueue *queue, cl::Context *context,
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
int Nb, int num_sweeps);
/// Set the verbosity
void setVerbosity(int verbosity_) {
this->verbosity = verbosity_;
}
};
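A hedged usage sketch, mirroring the call site in BILU0::create_preconditioner() shown earlier in this diff (all names are taken from that context):
ChowPatelIlu<block_size> chowPatelIlu;   // member of BILU0
chowPatelIlu.setVerbosity(verbosity);    // done in the BILU0 constructor
chowPatelIlu.decomposition(queue, context,
                           LUmat.get(), Lmat.get(), Umat.get(),
                           invDiagVals, diagIndex,
                           s.diagIndex, s.invDiagVals,
                           s.Lvals, s.Lcols, s.Lrows,
                           s.Uvals, s.Ucols, s.Urows);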
} // namespace Accelerator
} // namespace Opm
#endif // CHOW_PATEL
#endif // CHOW_PATEL_ILU_HEADER_INCLUDED

View File

@ -29,7 +29,9 @@
#include <opm/simulators/linalg/bda/Reorder.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -412,4 +414,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -25,7 +25,9 @@
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/*
@ -112,6 +114,7 @@ public:
};
} //namespace bda
} // namespace Accelerator
} // namespace Opm
#endif // FPGA_BILU0_HEADER_INCLUDED

View File

@ -23,7 +23,9 @@
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/* Sort a row of matrix elements from a CSR-format matrix. */
@ -246,4 +248,5 @@ int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,
return 0;
}
} // end namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -22,7 +22,9 @@
#include <vector>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This struct resembles a csr matrix, only doubles are supported
@ -79,6 +81,7 @@ public:
void sortRow(int *colIndices, double *data, int left, int right);
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
#endif // FPGA_MATRIX_HEADER_INCLUDED

View File

@ -39,7 +39,9 @@
//#define FPGA_STATISTICS_FILE_ENABLED
#undef FPGA_STATISTICS_FILE_ENABLED
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -731,4 +733,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -27,7 +27,9 @@
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/opencl_lib.hpp>
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/fpga_functions_bicgstab.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class implements an ilu0-bicgstab solver on FPGA
@ -259,7 +261,8 @@ public:
}; // end class fpgaSolverBackend
} //namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -20,7 +20,9 @@
#include <sys/time.h>
#include <fstream>
namespace bda
namespace Opm
{
namespace Accelerator
{
double second(void)
@ -60,4 +62,5 @@ bool fileExists(const char *filename)
}
}
} //namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -20,7 +20,9 @@
#ifndef FPGA_UTILS_HEADER_INCLUDED
#define FPGA_UTILS_HEADER_INCLUDED
namespace bda
namespace Opm
{
namespace Accelerator
{
union double2int
@ -34,6 +36,7 @@ bool even(int n);
int roundUpTo(int i, int n);
bool fileExists(const char *filename);
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
#endif // FPGA_UTILS_HEADER_INCLUDED

View File

@ -20,7 +20,9 @@
#ifndef ILUREORDER_HEADER_INCLUDED
#define ILUREORDER_HEADER_INCLUDED
namespace bda
namespace Opm
{
namespace Accelerator
{
// Level Scheduling respects the dependencies in the original matrix, and behaves like Dune and cusparse
// Graph Coloring is more aggressive and is likely to significantly increase the number of linearizations and linear iterations needed to converge, but it can still be faster on a GPU because it exposes more parallelism
@ -31,6 +33,7 @@ namespace bda
NONE
};
}
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -45,7 +45,9 @@ namespace {
}
namespace bda
namespace Opm
{
namespace Accelerator
{
@ -381,4 +383,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -24,7 +24,9 @@
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
#define MAX_COLORS 256
@ -121,6 +123,7 @@ void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, cons
/// \param[in] Nb number of blockrows in the matrix
void csrPatternToCsc(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb);
}
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -23,11 +23,14 @@
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace Opm
{
using Opm::Accelerator::OpenclKernels;
WellContributions::WellContributions(std::string accelerator_mode, bool useWellConn){
if(accelerator_mode.compare("cusparse") == 0){
cuda_gpu = true;
@ -82,8 +85,8 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu
this->queue = queue_;
}
void WellContributions::setKernel(bda::stdwell_apply_kernel_type *kernel_,
bda::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
void WellContributions::setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}
@ -95,22 +98,13 @@ void WellContributions::setReordering(int *h_toOrder_, bool reorder_)
}
void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
cl::Event event;
if (reorder) {
event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
} else {
event = (*kernel_no_reorder)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, dim, dim_wells, *d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
}
event.wait();
}
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){

View File

@ -94,8 +94,8 @@ private:
#if HAVE_OPENCL
cl::Context *context;
cl::CommandQueue *queue;
bda::stdwell_apply_kernel_type *kernel;
bda::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
Opm::Accelerator::stdwell_apply_kernel_type *kernel;
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
@ -147,8 +147,8 @@ public:
#endif
#if HAVE_OPENCL
void setKernel(bda::stdwell_apply_kernel_type *kernel_,
bda::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the column indices of the matrix data are incorrect
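A sketch of the mapping this refers to, assuming toOrder maps an original blockrow index to its reordered position; the vector names and the direction of the mapping are illustrative:
// illustrative only: permute a blocked vector of Nb blockrows of size dim
for (int i = 0; i < Nb; ++i) {
    for (int k = 0; k < dim; ++k) {
        x_reordered[toOrder[i] * dim + k] = x_original[i * dim + k];
    }
}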

View File

@ -35,7 +35,9 @@
#include <amgcl/backend/vexcl_static_matrix.hpp>
#endif
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -381,4 +383,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -28,7 +28,9 @@
/// This file is only compiled when both amgcl and CUDA are found by CMake
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -87,5 +89,6 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -38,7 +38,9 @@
#include <amgcl/preconditioner/runtime.hpp>
#include <amgcl/value_type/static_matrix.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class does not implement a solver, but converts the BCSR format to normal CSR and uses amgcl for solving
@ -149,7 +151,8 @@ public:
}; // end class amgclSolverBackend
} // namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -37,7 +37,9 @@
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
#define COPY_ROW_BY_ROW 0
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -511,6 +513,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -28,7 +28,9 @@
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
@ -142,7 +144,8 @@ public:
}; // end class cusparseSolverBackend
} // namespace bda
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -18,12 +18,345 @@
*/
#include <config.h>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <cmath>
#include <sstream>
namespace bda
#include <opm/common/OpmLog/OpmLog.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp> // defines CHOW_PATEL
namespace Opm
{
namespace Accelerator
{
std::string get_axpy_string() {
using Opm::OpmLog;
using Dune::Timer;
// define static variables and kernels
int OpenclKernels::verbosity;
cl::CommandQueue *OpenclKernels::queue;
std::vector<double> OpenclKernels::tmp;
bool OpenclKernels::initialized = false;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_blocked_k;
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
std::unique_ptr<stdwell_apply_no_reorder_kernel_type> OpenclKernels::stdwell_apply_no_reorder_k;
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
// divide A by B, and round up: return (int)ceil(A/B)
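// e.g. ceilDivision(1000, 256) == 4 and ceilDivision(1024, 256) == 4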
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
{
return A / B + (A % B > 0);
}
void add_kernel_string(cl::Program::Sources &sources, const std::string &source) {
sources.emplace_back(source);
}
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_) {
if (initialized) {
OpmLog::debug("Warning: OpenclKernels is already initialized");
return;
}
queue = queue_;
verbosity = verbosity_;
cl::Program::Sources sources;
const std::string& axpy_s = get_axpy_string();
add_kernel_string(sources, axpy_s);
const std::string& scale_s = get_scale_string();
add_kernel_string(sources, scale_s);
const std::string& dot_1_s = get_dot_1_string();
add_kernel_string(sources, dot_1_s);
const std::string& norm_s = get_norm_string();
add_kernel_string(sources, norm_s);
const std::string& custom_s = get_custom_string();
add_kernel_string(sources, custom_s);
const std::string& spmv_blocked_s = get_spmv_blocked_string();
add_kernel_string(sources, spmv_blocked_s);
#if CHOW_PATEL
bool ilu_operate_on_full_matrix = false;
#else
bool ilu_operate_on_full_matrix = true;
#endif
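// with the Chow-Patel decomposition, L and U are separate triangular matrices;
// otherwise they live inside one full (square) matrix and diagIndex marks the
// split, see get_ILU_apply1_string() and get_ILU_apply2_string()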
const std::string& ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply1_s);
const std::string& ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply2_s);
const std::string& stdwell_apply_s = get_stdwell_apply_string(true);
add_kernel_string(sources, stdwell_apply_s);
const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
add_kernel_string(sources, stdwell_apply_no_reorder_s);
const std::string& ilu_decomp_s = get_ilu_decomp_string();
add_kernel_string(sources, ilu_decomp_s);
cl::Program program = cl::Program(*context, sources);
program.build(devices);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::KernelFunctor<> myKernel(); myKernel(args, arg1, arg2); is also blocking
// actually creating the kernels
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
initialized = true;
} // end init()
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_dot;
tmp.resize(num_work_groups);
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
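// dot_1 reduces within each work-group; out holds one partial sum per
// work-group, which is read back and summed on the CPU below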
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
double gpu_sum = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_sum += tmp[i];
}
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels dot() time: " << t_dot.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_sum;
}
double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_norm;
tmp.resize(num_work_groups);
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
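// the norm kernel returns partial sums of squares per work-group; the CPU
// adds them and takes the final square root below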
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
double gpu_norm = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_norm += tmp[i];
}
gpu_norm = sqrt(gpu_norm);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels norm() time: " << t_norm.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_norm;
}
void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_axpy;
cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels axpy() time: " << t_axpy.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_scale;
cl::Event event = (*scale_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels scale() time: " << t_scale.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_custom;
cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels custom() time: " << t_custom.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_spmv;
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels spmv_blocked() time: " << t_spmv.stop() << " s";
OpmLog::info(oss.str());
}
}
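// For reference: the CPU semantics of spmv_blocked above, sketched for a BSR
// matrix with Nb blockrows and bs-by-bs, row-major blocks. Illustrative only,
// not part of this PR.
void spmv_bsr_reference(const double *vals, const int *cols, const int *rows,
                        const double *x, double *b, int Nb, int bs)
{
    for (int row = 0; row < Nb; ++row) {
        for (int r = 0; r < bs; ++r)
            b[row * bs + r] = 0.0;
        for (int blk = rows[row]; blk < rows[row + 1]; ++blk) {
            const double *block = vals + static_cast<long>(blk) * bs * bs;
            const int col = cols[blk];
            for (int r = 0; r < bs; ++r)
                for (int c = 0; c < bs; ++c)
                    b[row * bs + r] += block[r * bs + c] * x[col * bs + c];
        }
    }
}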
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_ilu_apply1;
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, y, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels ILU_apply1() time: " << t_ilu_apply1.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_ilu_apply2;
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, invDiagVals, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels ILU_apply2() time: " << t_ilu_apply2.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size)
{
const unsigned int work_group_size2 = 128;
const unsigned int num_work_groups2 = 1024;
const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
const unsigned int num_hwarps_per_group = work_group_size2 / 16;
const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
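// e.g. for block_size == 3: (128 / 16) hwarps * 3 * 3 doubles * 8 bytes = 576 bytes per work-group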
Timer t_ilu_decomp;
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, vals, cols, rows, invDiagVals, diagIndex, Nb, cl::Local(lmem_per_work_group2));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels ILU_decomp() time: " << t_ilu_decomp.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
cl::Buffer &d_toOrder, int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
{
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
Timer t_apply_stdwells;
cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels apply_stdwells_reorder() time: " << t_apply_stdwells.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
{
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
Timer t_apply_stdwells;
cl::Event event = (*stdwell_apply_no_reorder_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels apply_stdwells_no_reorder() time: " << t_apply_stdwells.stop() << " s";
OpmLog::info(oss.str());
}
}
std::string OpenclKernels::get_axpy_string() {
return R"(
__kernel void axpy(
__global double *in,
@ -44,7 +377,7 @@ namespace bda
// scale vector with scalar
std::string get_scale_string() {
std::string OpenclKernels::get_scale_string() {
return R"(
__kernel void scale(
__global double *vec,
@ -64,7 +397,7 @@ namespace bda
// returns partial sums, instead of the final dot product
std::string get_dot_1_string() {
std::string OpenclKernels::get_dot_1_string() {
return R"(
__kernel void dot_1(
__global double *in1,
@ -107,7 +440,7 @@ namespace bda
// returns partial sums, instead of the final norm
// the square root must be computed on CPU
std::string get_norm_string() {
std::string OpenclKernels::get_norm_string() {
return R"(
__kernel void norm(
__global double *in,
@ -148,7 +481,7 @@ namespace bda
// p = (p - omega * v) * beta + r
std::string get_custom_string() {
std::string OpenclKernels::get_custom_string() {
return R"(
__kernel void custom(
__global double *p,
@ -173,7 +506,7 @@ namespace bda
)";
}
std::string get_spmv_blocked_string() {
std::string OpenclKernels::get_spmv_blocked_string() {
return R"(
__kernel void spmv_blocked(
__global const double *vals,
@ -243,7 +576,7 @@ namespace bda
std::string get_ILU_apply1_string(bool full_matrix) {
std::string OpenclKernels::get_ILU_apply1_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply1(
__global const double *LUvals,
@ -319,7 +652,7 @@ namespace bda
}
std::string get_ILU_apply2_string(bool full_matrix) {
std::string OpenclKernels::get_ILU_apply2_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply2(
__global const double *LUvals,
@ -402,7 +735,7 @@ namespace bda
return s;
}
std::string get_stdwell_apply_string(bool reorder) {
std::string OpenclKernels::get_stdwell_apply_string(bool reorder) {
std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
std::string s = "__kernel void " + kernel_name + R"((
__global const double *Cnnzs,
@ -497,7 +830,7 @@ namespace bda
}
std::string get_ilu_decomp_string() {
std::string OpenclKernels::get_ilu_decomp_string() {
return R"(
// a = a - (b * c)
@ -637,5 +970,6 @@ namespace bda
)";
}
} // end namespace bda
} // namespace Accelerator
} // namespace Opm
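Most kernel bodies are truncated by the hunks above. As an illustration of what these strings contain, a minimal axpy kernel consistent with the documented semantics (out += a * in) could look like this sketch; it is not necessarily the exact committed source:
// sketch only: a plausible body for the string returned by get_axpy_string()
std::string axpy_kernel_sketch() {
    return R"(
    __kernel void axpy(
        __global double *in,
        const double a,
        __global double *out,
        const unsigned int N)
    {
        // grid-stride loop so any launch size covers all N entries
        unsigned int idx = get_global_id(0);
        while (idx < N) {
            out[idx] += a * in[idx];
            idx += get_global_size(0);
        }
    }
    )";
}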

View File

@ -21,15 +21,18 @@
#define OPENCL_HPP
#include <string>
#include <memory>
#include <opm/simulators/linalg/bda/opencl.hpp>
namespace bda
namespace Opm
{
namespace Accelerator
{
using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
@ -44,57 +47,107 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::
using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;
class OpenclKernels
{
private:
static int verbosity;
static cl::CommandQueue *queue;
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
static std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
/// Generate string with axpy kernel
/// a = a + alpha * b
std::string get_axpy_string();
static std::string get_axpy_string();
/// Generate string with scale kernel
/// a = a * alpha
std::string get_scale_string();
static std::string get_scale_string();
/// returns partial sums, instead of the final dot product
/// partial sums are added on CPU
std::string get_dot_1_string();
static std::string get_dot_1_string();
/// returns partial sums, instead of the final norm
/// the square root must be computed on CPU
std::string get_norm_string();
static std::string get_norm_string();
/// Generate string with custom kernel
/// This kernel combines some of the ilu0-bicgstab vector operations into one
/// p = (p - omega * v) * beta + r
std::string get_custom_string();
static std::string get_custom_string();
/// b = mat * x
/// algorithm based on:
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
std::string get_spmv_blocked_string();
static std::string get_spmv_blocked_string();
/// ILU apply part 1: forward substitution
/// solves L*x=y where L is a lower triangular sparse blocked matrix
/// this L can be its own BSR matrix (if full_matrix is false),
/// or it can be stored inside a normal, square matrix, in which case diagIndex indicates where the rows of L end
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
std::string get_ILU_apply1_string(bool full_matrix);
static std::string get_ILU_apply1_string(bool full_matrix);
/// ILU apply part 2: backward substitution
/// solves U*x=y where U is an upper triangular sparse blocked matrix
/// this U can be its own BSR matrix (if full_matrix is false),
/// or it can be stored inside a normal, square matrix, in which case diagIndex indicates where the rows of U start
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
std::string get_ILU_apply2_string(bool full_matrix);
static std::string get_ILU_apply2_string(bool full_matrix);
/// Generate string with the stdwell_apply kernels
/// If reorder is true, the B/C column indices do not correspond to the x/y vectors;
/// the x/y vectors are reordered, so toOrder is used to address them
/// \param[in] reorder whether the matrix is reordered or not
std::string get_stdwell_apply_string(bool reorder);
static std::string get_stdwell_apply_string(bool reorder);
/// Generate string with the exact ilu decomposition kernel
/// The kernel takes a full BSR matrix and performs in-place ILU decomposition
std::string get_ilu_decomp_string();
static std::string get_ilu_decomp_string();
} // end namespace bda
OpenclKernels() {} // disable instantiation
public:
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const double a, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
static void spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size);
static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
static void ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size);
static void apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
cl::Buffer &d_toOrder, int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells);
static void apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells);
};
} // namespace Accelerator
} // namespace Opm
#endif
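A sketch of how callers use the new static interface; context, queue, devices and the cl::Buffer objects are assumed to exist (as in openclSolverBackend), and error handling is omitted:
using Opm::Accelerator::OpenclKernels;
OpenclKernels::init(context.get(), queue.get(), devices, verbosity); // compile kernels once
double nrm = OpenclKernels::norm(d_r, d_tmp, N);                     // ||r||_2
double rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);                // (rw, r)
OpenclKernels::axpy(d_v, -1.0, d_r, N);                              // r = r - v
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_x, d_v, Nb, block_size); // v = A * x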

View File

@ -35,7 +35,9 @@
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
#define COPY_ROW_BY_ROW 0
namespace bda
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
@ -174,6 +176,8 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
context = std::make_shared<cl::Context>(devices[0]);
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
OpenclKernels::init(context.get(), queue.get(), devices, verbosity);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -193,125 +197,6 @@ openclSolverBackend<block_size>::~openclSolverBackend() {
}
// divide A by B, and round up: return (int)ceil(A/B)
template <unsigned int block_size>
unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A, const unsigned int B)
{
return A / B + (A % B > 0);
}
template <unsigned int block_size>
double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_dot;
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
double gpu_sum = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_sum += tmp[i];
}
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver dot_w time: " << t_dot.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_sum;
}
template <unsigned int block_size>
double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_norm;
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
double gpu_norm = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_norm += tmp[i];
}
gpu_norm = sqrt(gpu_norm);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver norm_w time: " << t_norm.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_norm;
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_axpy;
cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver axpy_w time: " << t_axpy.stop() << " s";
OpmLog::info(oss.str());
}
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_custom;
cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver custom_w time: " << t_custom.stop() << " s";
OpmLog::info(oss.str());
}
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_spmv;
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver spmv_blocked_w time: " << t_spmv.stop() << " s";
OpmLog::info(oss.str());
}
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
@ -319,10 +204,6 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
double norm, norm_0;
if(wellContribs.getNumWells() > 0){
wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get());
}
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
// set r to the initial residual
@ -348,7 +229,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
}
norm = norm_w(d_r, d_tmp);
norm = OpenclKernels::norm(d_r, d_tmp, N);
norm_0 = norm;
if (verbosity > 1) {
@ -360,11 +241,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_rest.start();
for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho;
rho = dot_w(d_rw, d_r, d_tmp);
rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);
if (it > 1) {
beta = (rho / rhop) * (alpha / omega);
custom_w(d_p, d_v, d_r, omega, beta);
OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N);
}
t_rest.stop();
@ -375,7 +256,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// v = A * pw
t_spmv.start();
spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
t_spmv.stop();
// apply wellContributions
@ -386,11 +267,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_well.stop();
t_rest.start();
tmp1 = dot_w(d_rw, d_v, d_tmp);
tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N);
alpha = rho / tmp1;
axpy_w(d_v, -alpha, d_r); // r = r - alpha * v
axpy_w(d_pw, alpha, d_x); // x = x + alpha * pw
norm = norm_w(d_r, d_tmp);
OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels::norm(d_r, d_tmp, N);
t_rest.stop();
if (norm < tolerance * norm_0) {
@ -406,7 +287,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// t = A * s
t_spmv.start();
spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t);
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
t_spmv.stop();
// apply wellContributions
@ -417,12 +298,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_well.stop();
t_rest.start();
tmp1 = dot_w(d_t, d_r, d_tmp);
tmp2 = dot_w(d_t, d_t, d_tmp);
tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N);
omega = tmp1 / tmp2;
axpy_w(d_s, omega, d_x); // x = x + omega * s
axpy_w(d_t, -omega, d_r); // r = r - omega * t
norm = norm_w(d_r, d_tmp);
OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels::norm(d_r, d_tmp, N);
t_rest.stop();
if (norm < tolerance * norm_0) {
@ -479,7 +360,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
tmp = new double[N];
#if COPY_ROW_BY_ROW
vals_contiguous = new double[N];
#endif
@ -507,10 +387,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}
get_opencl_kernels();
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), scale_k.get(), ilu_decomp_k.get());
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -525,68 +401,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
initialized = true;
} // end initialize()
void add_kernel_string(cl::Program::Sources &sources, std::string &source) {
sources.emplace_back(source);
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::get_opencl_kernels() {
cl::Program::Sources sources;
std::string axpy_s = get_axpy_string();
add_kernel_string(sources, axpy_s);
std::string scale_s = get_scale_string();
add_kernel_string(sources, scale_s);
std::string dot_1_s = get_dot_1_string();
add_kernel_string(sources, dot_1_s);
std::string norm_s = get_norm_string();
add_kernel_string(sources, norm_s);
std::string custom_s = get_custom_string();
add_kernel_string(sources, custom_s);
std::string spmv_blocked_s = get_spmv_blocked_string();
add_kernel_string(sources, spmv_blocked_s);
#if CHOW_PATEL
bool ilu_operate_on_full_matrix = false;
#else
bool ilu_operate_on_full_matrix = true;
#endif
std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply1_s);
std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply2_s);
std::string stdwell_apply_s = get_stdwell_apply_string(true);
add_kernel_string(sources, stdwell_apply_s);
std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
add_kernel_string(sources, stdwell_apply_no_reorder_s);
std::string ilu_decomp_s = get_ilu_decomp_string();
add_kernel_string(sources, ilu_decomp_s);
cl::Program program = cl::Program(*context, sources);
program.build(devices);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::KernelFunctor<> myKernel(); myKernel(args, arg1, arg2); is also blocking
// actually creating the kernels
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
} // end get_opencl_kernels()
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] rb;
}
delete[] tmp;
#if COPY_ROW_BY_ROW
delete[] vals_contiguous;
#endif
@ -672,11 +492,6 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
Timer t;
bool success = prec->init(mat.get());
int work_group_size = 32;
int num_work_groups = ceilDivision(N, work_group_size);
int total_work_items = num_work_groups * work_group_size;
int lmem_per_work_group = work_group_size * sizeof(double);
prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);
if (opencl_ilu_reorder == ILUReorder::NONE) {
rmat = mat.get();
@ -820,4 +635,5 @@ INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda
} // namespace Accelerator
} // namespace Opm

View File

@ -30,7 +30,9 @@
#include <tuple>
namespace bda
namespace Opm
{
namespace Accelerator
{
/// This class implements an opencl-based ilu0-bicgstab solver on GPU
@ -61,21 +63,8 @@ private:
cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
cl::Buffer d_toOrder; // only used when reordering is used
double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()
// shared pointers are also passed to other objects
std::vector<cl::Device> devices;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
std::shared_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
std::shared_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
std::shared_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
std::shared_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
std::shared_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@ -150,9 +139,6 @@ private:
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
/// Generate and compile opencl kernels
void get_opencl_kernels();
/// Clean memory
void finalize();
@ -220,7 +206,8 @@ public:
}; // end class openclSolverBackend
} // namespace bda
} // namespace Accelerator
} // namespace Opm
#endif