mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Exact ILU decomp is now performed on GPU
This commit is contained in:
@@ -18,6 +18,7 @@
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <opm/simulators/linalg/MatrixBlock.hpp>
|
||||
@@ -32,14 +33,6 @@
|
||||
namespace bda
|
||||
{
|
||||
|
||||
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
|
||||
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
|
||||
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
|
||||
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
|
||||
// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
|
||||
#define CHOW_PATEL 0
|
||||
#define CHOW_PATEL_GPU 1
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
@@ -126,41 +119,46 @@ namespace bda
|
||||
diagIndex = new int[mat->Nb];
|
||||
invDiagVals = new double[mat->Nb * bs * bs];
|
||||
|
||||
#if CHOW_PATEL
|
||||
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
#endif
|
||||
|
||||
LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
|
||||
|
||||
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
||||
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs);
|
||||
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
||||
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Umat->nnzbs);
|
||||
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
|
||||
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Umat->Nb + 1));
|
||||
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
|
||||
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
|
||||
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
|
||||
#if CHOW_PATEL
|
||||
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
||||
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
||||
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
|
||||
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
||||
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
||||
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
|
||||
#else
|
||||
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
|
||||
s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
|
||||
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
|
||||
#endif
|
||||
|
||||
queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues);
|
||||
queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues);
|
||||
queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
|
||||
queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices);
|
||||
queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
|
||||
queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers);
|
||||
queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals);
|
||||
|
||||
int *rowsPerColorPrefix = new int[numColors + 1];
|
||||
rowsPerColorPrefix[0] = 0;
|
||||
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
|
||||
for (int i = 0; i < numColors; ++i) {
|
||||
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
|
||||
}
|
||||
queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix);
|
||||
delete[] rowsPerColorPrefix;
|
||||
queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data());
|
||||
|
||||
return true;
|
||||
} // end init()
|
||||
|
||||
|
||||
// implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::chow_patel_decomposition()
|
||||
{
|
||||
#if CHOW_PATEL
|
||||
const unsigned int bs = block_size;
|
||||
int num_sweeps = 6;
|
||||
|
||||
@@ -399,7 +397,7 @@ namespace bda
|
||||
// convert Ut to BSR
|
||||
// diagonal stored separately
|
||||
std::vector<int> ptr(Nb+1, 0);
|
||||
std::vector<int> col(Ut->rowPointers[Nb]);
|
||||
std::vector<int> cols(Ut->rowPointers[Nb]);
|
||||
|
||||
// count blocks per row for U (BSR)
|
||||
// store diagonal in invDiagVals
|
||||
@@ -423,7 +421,7 @@ namespace bda
|
||||
int j = Ut->colIndices[k];
|
||||
if (j != i) {
|
||||
int head = ptr[j]++;
|
||||
col[head] = i;
|
||||
cols[head] = i;
|
||||
memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
|
||||
}
|
||||
}
|
||||
@@ -433,21 +431,39 @@ namespace bda
|
||||
std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
|
||||
ptr.front() = 0;
|
||||
|
||||
// reversing the rows of U, because that is the order they are used in during ILU apply
|
||||
int URowIndex = 0;
|
||||
int offsetU = 0; // number of nnz blocks that are already copied to Umat
|
||||
Umat->rowPointers[0] = 0;
|
||||
for (int i = LUmat->Nb - 1; i >= 0; i--) {
|
||||
int rowSize = ptr[i + 1] - ptr[i]; // number of blocks in this row
|
||||
memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize);
|
||||
memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize);
|
||||
offsetU += rowSize;
|
||||
Umat->rowPointers[URowIndex + 1] = offsetU;
|
||||
URowIndex++;
|
||||
}
|
||||
std::call_once(pattern_uploaded, [&](){
|
||||
// find the positions of each diagonal block
|
||||
// must be done after reordering
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int rowStart = LUmat->rowPointers[row];
|
||||
int rowEnd = LUmat->rowPointers[row+1];
|
||||
bool diagFound = false;
|
||||
for (int ij = rowStart; ij < rowEnd; ++ij) {
|
||||
int col = LUmat->colIndices[ij];
|
||||
if (row == col) {
|
||||
diagIndex[row] = ij;
|
||||
diagFound = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!diagFound) {
|
||||
OPM_THROW(std::logic_error, "Error did not find diagonal block in reordered matrix");
|
||||
}
|
||||
}
|
||||
queue->enqueueWriteBuffer(s.diagIndex, CL_TRUE, 0, Nb * sizeof(int), diagIndex);
|
||||
queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
|
||||
queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
|
||||
queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), cols.data());
|
||||
queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), ptr.data());
|
||||
});
|
||||
|
||||
|
||||
queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * bs * bs * sizeof(double), Lmat->nnzValues);
|
||||
queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * bs * bs * sizeof(double), Utmp);
|
||||
queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, LUmat->Nb * bs * bs * sizeof(double), invDiagVals);
|
||||
|
||||
delete[] Utmp;
|
||||
|
||||
#endif // CHOW_PATEL
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
@@ -479,118 +495,66 @@ namespace bda
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Timer t_decomposition;
|
||||
#if CHOW_PATEL
|
||||
chow_patel_decomposition();
|
||||
#else
|
||||
int i, j, ij, ik, jk;
|
||||
int iRowStart, iRowEnd, jRowEnd;
|
||||
double pivot[bs * bs];
|
||||
Timer t_copyToGpu;
|
||||
|
||||
int LSize = 0;
|
||||
Opm::Detail::Inverter<bs> inverter; // reuse inverter to invert blocks
|
||||
queue->enqueueWriteBuffer(s.LUvals, CL_TRUE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues);
|
||||
|
||||
// go through all rows
|
||||
for (i = 0; i < LUmat->Nb; i++) {
|
||||
iRowStart = LUmat->rowPointers[i];
|
||||
iRowEnd = LUmat->rowPointers[i + 1];
|
||||
|
||||
// go through all elements of the row
|
||||
for (ij = iRowStart; ij < iRowEnd; ij++) {
|
||||
j = LUmat->colIndices[ij];
|
||||
// if the element is the diagonal, store the index and go to next row
|
||||
if (j == i) {
|
||||
diagIndex[i] = ij;
|
||||
break;
|
||||
}
|
||||
// if an element beyond the diagonal is reach, no diagonal was found
|
||||
// throw an error now. TODO: perform reordering earlier to prevent this
|
||||
if (j > i) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 Error could not find diagonal value in row: " << i;
|
||||
OpmLog::error(out.str());
|
||||
return false;
|
||||
}
|
||||
|
||||
LSize++;
|
||||
// calculate the pivot of this row
|
||||
blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
|
||||
|
||||
memcpy(LUmat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * block_size * block_size);
|
||||
|
||||
jRowEnd = LUmat->rowPointers[j + 1];
|
||||
jk = diagIndex[j] + 1;
|
||||
ik = ij + 1;
|
||||
// substract that row scaled by the pivot from this row.
|
||||
while (ik < iRowEnd && jk < jRowEnd) {
|
||||
if (LUmat->colIndices[ik] == LUmat->colIndices[jk]) {
|
||||
blockMultSub<bs>(LUmat->nnzValues + ik * bs * bs, pivot, LUmat->nnzValues + jk * bs * bs);
|
||||
ik++;
|
||||
jk++;
|
||||
} else {
|
||||
if (LUmat->colIndices[ik] < LUmat->colIndices[jk])
|
||||
{ ik++; }
|
||||
else
|
||||
{ jk++; }
|
||||
std::call_once(pattern_uploaded, [&](){
|
||||
// find the positions of each diagonal block
|
||||
// must be done after reordering
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int rowStart = LUmat->rowPointers[row];
|
||||
int rowEnd = LUmat->rowPointers[row+1];
|
||||
bool diagFound = false;
|
||||
for (int ij = rowStart; ij < rowEnd; ++ij) {
|
||||
int col = LUmat->colIndices[ij];
|
||||
if (row == col) {
|
||||
diagIndex[row] = ij;
|
||||
diagFound = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!diagFound) {
|
||||
OPM_THROW(std::logic_error, "Error did not find diagonal block in reordered matrix");
|
||||
}
|
||||
}
|
||||
// store the inverse in the diagonal!
|
||||
inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
|
||||
queue->enqueueWriteBuffer(s.diagIndex, CL_TRUE, 0, Nb * sizeof(int), diagIndex);
|
||||
queue->enqueueWriteBuffer(s.LUcols, CL_TRUE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices);
|
||||
queue->enqueueWriteBuffer(s.LUrows, CL_TRUE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers);
|
||||
});
|
||||
|
||||
memcpy(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
|
||||
}
|
||||
|
||||
Lmat->rowPointers[0] = 0;
|
||||
Umat->rowPointers[0] = 0;
|
||||
|
||||
// Split the LU matrix into two by comparing column indices to diagonal indices
|
||||
for (i = 0; i < LUmat->Nb; i++) {
|
||||
int offsetL = Lmat->rowPointers[i];
|
||||
int rowSize = diagIndex[i] - LUmat->rowPointers[i];
|
||||
int offsetLU = LUmat->rowPointers[i];
|
||||
memcpy(Lmat->nnzValues + offsetL * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize);
|
||||
memcpy(Lmat->colIndices + offsetL, LUmat->colIndices + offsetLU, sizeof(int) * rowSize);
|
||||
offsetL += rowSize;
|
||||
Lmat->rowPointers[i + 1] = offsetL;
|
||||
}
|
||||
// Reverse the order or the (blocked) rows for the U matrix,
|
||||
// because the rows are accessed in reverse order when applying the ILU0
|
||||
int URowIndex = 0;
|
||||
for (i = LUmat->Nb - 1; i >= 0; i--) {
|
||||
int offsetU = Umat->rowPointers[URowIndex];
|
||||
int rowSize = LUmat->rowPointers[i + 1] - diagIndex[i] - 1;
|
||||
int offsetLU = diagIndex[i] + 1;
|
||||
memcpy(Umat->nnzValues + offsetU * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize);
|
||||
memcpy(Umat->colIndices + offsetU, LUmat->colIndices + offsetLU, sizeof(int) * rowSize);
|
||||
offsetU += rowSize;
|
||||
Umat->rowPointers[URowIndex + 1] = offsetU;
|
||||
URowIndex++;
|
||||
}
|
||||
#endif
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Timer t_copyToGpu;
|
||||
if (pattern_uploaded == false) {
|
||||
queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
|
||||
queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices);
|
||||
queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
|
||||
queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers);
|
||||
pattern_uploaded = true;
|
||||
}
|
||||
queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues);
|
||||
queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues);
|
||||
queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(double) * bs * bs, invDiagVals);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Timer t_decomposition;
|
||||
std::ostringstream out;
|
||||
for (int color = 0; color < numColors; ++color) {
|
||||
const unsigned int firstRow = rowsPerColorPrefix[color];
|
||||
const unsigned int lastRow = rowsPerColorPrefix[color+1];
|
||||
const unsigned int work_group_size2 = 128;
|
||||
const unsigned int num_work_groups2 = 1024;
|
||||
const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
|
||||
const unsigned int num_hwarps_per_group = work_group_size2 / 16;
|
||||
const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double); // each block needs a pivot
|
||||
if (verbosity >= 4) {
|
||||
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
|
||||
}
|
||||
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
|
||||
event.wait();
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
#endif // CHOW_PATEL
|
||||
|
||||
return true;
|
||||
} // end create_preconditioner()
|
||||
|
||||
@@ -604,16 +568,24 @@ namespace bda
|
||||
Timer t_apply;
|
||||
|
||||
for(int color = 0; color < numColors; ++color){
|
||||
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
#if CHOW_PATEL
|
||||
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
#else
|
||||
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
#endif
|
||||
// event.wait();
|
||||
}
|
||||
|
||||
for(int color = numColors-1; color >= 0; --color){
|
||||
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
#if CHOW_PATEL
|
||||
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
#else
|
||||
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
#endif
|
||||
// event.wait();
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream out;
|
||||
out << "BILU0 apply: " << t_apply.stop() << " s";
|
||||
@@ -638,11 +610,13 @@ namespace bda
|
||||
}
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setKernels(
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k_
|
||||
){
|
||||
this->ILU_apply1 = ILU_apply1_;
|
||||
this->ILU_apply2 = ILU_apply2_;
|
||||
this->ilu_decomp_k = ilu_decomp_k_;
|
||||
}
|
||||
|
||||
|
||||
@@ -657,8 +631,9 @@ template void BILU0<n>::setOpenCLContext(cl::Context*);
|
||||
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*); \
|
||||
template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int); \
|
||||
template void BILU0<n>::setKernels( \
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> * \
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> * \
|
||||
);
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
|
||||
@@ -20,12 +20,26 @@
|
||||
#ifndef BILU0_HPP
|
||||
#define BILU0_HPP
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
|
||||
|
||||
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
|
||||
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
|
||||
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
|
||||
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
|
||||
// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
|
||||
// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
|
||||
// the exact decomposition uses a full matrix LU which is the superposition of L and U
|
||||
// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
|
||||
#define CHOW_PATEL 0
|
||||
#define CHOW_PATEL_GPU 1
|
||||
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
@@ -40,33 +54,46 @@ namespace bda
|
||||
int Nb; // number of blockrows of the matrix
|
||||
int nnz; // number of nonzeroes of the matrix (scalar)
|
||||
int nnzbs; // number of blocks of the matrix
|
||||
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr, LUmat = nullptr;
|
||||
std::unique_ptr<BlockedMatrix<block_size> > LUmat = nullptr;
|
||||
std::shared_ptr<BlockedMatrix<block_size> > rmat = nullptr; // only used with PAR_SIM
|
||||
#if CHOW_PATEL
|
||||
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
|
||||
#endif
|
||||
double *invDiagVals = nullptr;
|
||||
int *diagIndex = nullptr;
|
||||
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
|
||||
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
|
||||
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
|
||||
int *toOrder = nullptr, *fromOrder = nullptr;
|
||||
int numColors;
|
||||
int verbosity;
|
||||
std::once_flag pattern_uploaded;
|
||||
|
||||
ILUReorder opencl_ilu_reorder;
|
||||
|
||||
typedef struct {
|
||||
cl::Buffer Lvals, Uvals, invDiagVals;
|
||||
cl::Buffer Lcols, Lrows;
|
||||
cl::Buffer Ucols, Urows;
|
||||
cl::Buffer invDiagVals;
|
||||
cl::Buffer diagIndex;
|
||||
cl::Buffer rowsPerColor;
|
||||
#if CHOW_PATEL
|
||||
cl::Buffer Lvals, Lcols, Lrows;
|
||||
cl::Buffer Uvals, Ucols, Urows;
|
||||
#else
|
||||
cl::Buffer LUvals, LUcols, LUrows;
|
||||
#endif
|
||||
} GPU_storage;
|
||||
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1;
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2;
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1;
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2;
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&,
|
||||
const int, cl::LocalSpaceArg> *ilu_decomp_k;
|
||||
|
||||
GPU_storage s;
|
||||
cl::Context *context;
|
||||
cl::CommandQueue *queue;
|
||||
int work_group_size = 0;
|
||||
int total_work_items = 0;
|
||||
int lmem_per_work_group = 0;
|
||||
bool pattern_uploaded = false;
|
||||
|
||||
ChowPatelIlu chowPatelIlu;
|
||||
|
||||
@@ -91,8 +118,9 @@ namespace bda
|
||||
void setOpenCLQueue(cl::CommandQueue *queue);
|
||||
void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
|
||||
void setKernels(
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k
|
||||
);
|
||||
|
||||
int* getToOrder()
|
||||
|
||||
@@ -201,11 +201,10 @@ void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder,
|
||||
|
||||
void colorsToReordering(int Nb, std::vector<int>& colors, int numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
|
||||
int reordered = 0;
|
||||
int i, c;
|
||||
|
||||
// Find reordering patterns
|
||||
for (c = 0; c < numColors; c++) {
|
||||
for (i = 0; i < Nb; i++) {
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
for (int i = 0; i < Nb; i++) {
|
||||
if (colors[i] == c) {
|
||||
rowsPerColor[c]++;
|
||||
toOrder[i] = reordered;
|
||||
@@ -259,10 +258,11 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd
|
||||
int activeRowIndex = 0, colorEnd, nextActiveRowIndex = 0;
|
||||
int thisRow;
|
||||
std::vector<bool> doneRows(Nb, false);
|
||||
rowsPerColor.reserve(Nb);
|
||||
|
||||
std::vector <int> rowsToStart;
|
||||
|
||||
// since emplace_back() is used to fill, the vector must be empty
|
||||
assert(rowsPerColor.size() == 0);
|
||||
|
||||
// find starting rows: rows that are independent from all rows that come before them.
|
||||
for (thisRow = 0; thisRow < Nb; thisRow++) {
|
||||
if (canBeStarted(thisRow, CSCColPointers, CSCRowIndices, doneRows)) {
|
||||
@@ -312,8 +312,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd
|
||||
|
||||
template <unsigned int block_size>
|
||||
void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
|
||||
std::vector<int> rowColor;
|
||||
rowColor.resize(Nb);
|
||||
std::vector<int> rowColor(Nb);
|
||||
|
||||
*numColors = colorBlockedNodes<block_size>(Nb, CSRRowPointers, CSRColIndices, CSCColPointers, CSCRowIndices, rowColor, maxRowsPerColor, maxColsPerColor);
|
||||
|
||||
|
||||
@@ -90,7 +90,7 @@ bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIn
|
||||
/// \param[out] numColors a pointer to the number of colors needed for the level scheduling
|
||||
/// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color
|
||||
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill
|
||||
void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor);
|
||||
|
||||
/// Find a graph coloring reordering for an input matrix
|
||||
@@ -105,7 +105,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd
|
||||
/// \param[out] numColors the number of colors used in the found graph coloring
|
||||
/// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color
|
||||
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function will resize()
|
||||
template <unsigned int block_size>
|
||||
void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor);
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
#include <cstring>
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <opm/simulators/linalg/bda/openclKernels.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||
|
||||
namespace Opm
|
||||
|
||||
@@ -225,7 +225,7 @@ namespace bda
|
||||
__global const double *Lvals,
|
||||
__global const unsigned int *Lcols,
|
||||
__global const unsigned int *Lrows,
|
||||
const unsigned int LrowSize,
|
||||
__global const int *diagIndex,
|
||||
__global const double *y,
|
||||
__global double *x,
|
||||
__global const unsigned int *nodesPerColorPrefix,
|
||||
@@ -250,7 +250,7 @@ namespace bda
|
||||
|
||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||
const unsigned int first_block = Lrows[target_block_row];
|
||||
const unsigned int last_block = Lrows[target_block_row+1];
|
||||
const unsigned int last_block = diagIndex[target_block_row];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
if(lane < num_active_threads){
|
||||
@@ -292,10 +292,10 @@ namespace bda
|
||||
// solves U*x=y where L is a lower triangular sparse blocked matrix
|
||||
inline const char* ILU_apply2_s = R"(
|
||||
__kernel void ILU_apply2(
|
||||
__global const double *Uvals,
|
||||
__global const int *Ucols,
|
||||
__global const int *Urows,
|
||||
const unsigned int UrowSize,
|
||||
__global const double *LUvals,
|
||||
__global const int *LUcols,
|
||||
__global const int *LUrows,
|
||||
__global const int *diagIndex,
|
||||
__global const double *invDiagVals,
|
||||
__global double *x,
|
||||
__global const unsigned int *nodesPerColorPrefix,
|
||||
@@ -320,8 +320,8 @@ namespace bda
|
||||
const double relaxation = 0.9;
|
||||
|
||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||
const unsigned int first_block = Urows[UrowSize-target_block_row-1];
|
||||
const unsigned int last_block = Urows[UrowSize-target_block_row];
|
||||
const unsigned int first_block = diagIndex[target_block_row] + 1;
|
||||
const unsigned int last_block = LUrows[target_block_row+1];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
if(lane < num_active_threads){
|
||||
@@ -330,8 +330,8 @@ namespace bda
|
||||
local_out = x[row];
|
||||
}
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
const double x_elem = x[Ucols[block]*bs + c];
|
||||
const double A_elem = Uvals[block*bs*bs + c + r*bs];
|
||||
const double x_elem = x[LUcols[block]*bs + c];
|
||||
const double A_elem = LUvals[block*bs*bs + c + r*bs];
|
||||
local_out -= x_elem * A_elem;
|
||||
}
|
||||
}
|
||||
@@ -513,6 +513,144 @@ namespace bda
|
||||
}
|
||||
)";
|
||||
|
||||
inline const char* ilu_decomp_s = R"(
|
||||
|
||||
// a = a - (b * c)
|
||||
__kernel void block_mult_sub(__global double *a, __local double *b, __global double *c)
|
||||
{
|
||||
const unsigned int block_size = 3;
|
||||
const unsigned int hwarp_size = 16;
|
||||
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
|
||||
if(thread_id_in_hwarp < block_size * block_size){
|
||||
const unsigned int row = thread_id_in_hwarp / block_size;
|
||||
const unsigned int col = thread_id_in_hwarp % block_size;
|
||||
double temp = 0.0;
|
||||
for (unsigned int k = 0; k < block_size; k++) {
|
||||
temp += b[block_size * row + k] * c[block_size * k + col];
|
||||
}
|
||||
a[block_size * row + col] -= temp;
|
||||
}
|
||||
}
|
||||
|
||||
// c = a * b
|
||||
__kernel void block_mult(__global double *a, __global double *b, __local double *c) {
|
||||
const unsigned int block_size = 3;
|
||||
const unsigned int hwarp_size = 16;
|
||||
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
|
||||
if(thread_id_in_hwarp < block_size * block_size){
|
||||
const unsigned int row = thread_id_in_hwarp / block_size;
|
||||
const unsigned int col = thread_id_in_hwarp % block_size;
|
||||
double temp = 0.0;
|
||||
for (unsigned int k = 0; k < block_size; k++) {
|
||||
temp += a[block_size * row + k] * b[block_size * k + col];
|
||||
}
|
||||
c[block_size * row + col] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
// invert 3x3 matrix
|
||||
__kernel void inverter(__global double *matrix, __global double *inverse) {
|
||||
const unsigned int block_size = 3;
|
||||
const unsigned int bs = block_size; // rename to shorter name
|
||||
const unsigned int hwarp_size = 16;
|
||||
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
|
||||
if(thread_id_in_hwarp < bs * bs){
|
||||
double t4 = matrix[0] * matrix[4];
|
||||
double t6 = matrix[0] * matrix[5];
|
||||
double t8 = matrix[1] * matrix[3];
|
||||
double t10 = matrix[2] * matrix[3];
|
||||
double t12 = matrix[1] * matrix[6];
|
||||
double t14 = matrix[2] * matrix[6];
|
||||
|
||||
double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
|
||||
t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
|
||||
double t17 = 1.0 / det;
|
||||
|
||||
const unsigned int r = thread_id_in_hwarp / bs;
|
||||
const unsigned int c = thread_id_in_hwarp % bs;
|
||||
const unsigned int r1 = (r+1) % bs;
|
||||
const unsigned int c1 = (c+1) % bs;
|
||||
const unsigned int r2 = (r+bs-1) % bs;
|
||||
const unsigned int c2 = (c+bs-1) % bs;
|
||||
inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void ilu_decomp(const unsigned int firstRow,
|
||||
const unsigned int lastRow,
|
||||
__global double *LUvals,
|
||||
__global const int *LUcols,
|
||||
__global const int *LUrows,
|
||||
__global double *invDiagVals,
|
||||
__global int *diagIndex,
|
||||
const unsigned int Nb,
|
||||
__local double *pivot){
|
||||
|
||||
const unsigned int bs = 3;
|
||||
const unsigned int hwarp_size = 16;
|
||||
const unsigned int work_group_size = get_local_size(0);
|
||||
const unsigned int work_group_id = get_group_id(0);
|
||||
const unsigned int num_groups = get_num_groups(0);
|
||||
const unsigned int hwarps_per_group = work_group_size / hwarp_size;
|
||||
const unsigned int thread_id_in_group = get_local_id(0); // thread id in work group
|
||||
const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size; // thread id in hwarp (16 threads)
|
||||
const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size;
|
||||
const unsigned int lmem_offset = hwarp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it
|
||||
// every workitem in a hwarp has the same lmem_offset
|
||||
|
||||
// go through all rows
|
||||
for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
|
||||
{
|
||||
int iRowStart = LUrows[i];
|
||||
int iRowEnd = LUrows[i + 1];
|
||||
|
||||
// go through all elements of the row
|
||||
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
||||
int j = LUcols[ij];
|
||||
|
||||
if (j < i) {
|
||||
// calculate the pivot of this row
|
||||
block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
|
||||
|
||||
// copy pivot
|
||||
if (thread_id_in_hwarp < bs * bs) {
|
||||
LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp];
|
||||
}
|
||||
|
||||
int jRowEnd = LUrows[j + 1];
|
||||
int jk = diagIndex[j] + 1;
|
||||
int ik = ij + 1;
|
||||
// substract that row scaled by the pivot from this row.
|
||||
while (ik < iRowEnd && jk < jRowEnd) {
|
||||
if (LUcols[ik] == LUcols[jk]) {
|
||||
block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs);
|
||||
ik++;
|
||||
jk++;
|
||||
} else {
|
||||
if (LUcols[ik] < LUcols[jk])
|
||||
{ ik++; }
|
||||
else
|
||||
{ jk++; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// store the inverse in the diagonal
|
||||
inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
|
||||
|
||||
// copy inverse
|
||||
if (thread_id_in_hwarp < bs * bs) {
|
||||
LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
)";
|
||||
|
||||
} // end namespace bda
|
||||
|
||||
#endif
|
||||
|
||||
@@ -484,6 +484,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
||||
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
|
||||
source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
|
||||
source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
|
||||
source.emplace_back(std::make_pair(ilu_decomp_s, strlen(ilu_decomp_s)));
|
||||
program = cl::Program(*context, source);
|
||||
program.build(devices);
|
||||
|
||||
@@ -527,8 +528,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
||||
axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
|
||||
custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
|
||||
spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
|
||||
ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
|
||||
ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
|
||||
ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
|
||||
ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
|
||||
stdwell_apply_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const unsigned int, const unsigned int, cl::Buffer&,
|
||||
@@ -537,8 +538,10 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const unsigned int, const unsigned int, cl::Buffer&,
|
||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
|
||||
ilu_decomp_k.reset(new cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const int, cl::LocalSpaceArg>(cl::Kernel(program, "ilu_decomp")));
|
||||
|
||||
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
|
||||
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());
|
||||
|
||||
} catch (const cl::Error& error) {
|
||||
std::ostringstream oss;
|
||||
|
||||
@@ -70,8 +70,8 @@ private:
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
|
||||
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
|
||||
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
|
||||
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
|
||||
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
|
||||
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const unsigned int, const unsigned int, cl::Buffer&,
|
||||
@@ -80,6 +80,8 @@ private:
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const unsigned int, const unsigned int, cl::Buffer&,
|
||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_no_reorder_k;
|
||||
std::shared_ptr<cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const int, cl::LocalSpaceArg> > ilu_decomp_k;
|
||||
|
||||
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
|
||||
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
|
||||
|
||||
Reference in New Issue
Block a user