mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Merge pull request #2991 from Tongdongq/fgpilu_cpu_gpu
Added CPU and GPU implementations of Fine-Grained Parallel ILU (FGPILU)
This commit is contained in:
@@ -58,6 +58,7 @@ if(OPENCL_FOUND)
|
|||||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
|
||||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
|
||||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
|
||||||
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp)
|
||||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
|
||||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
|
||||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
|
||||||
@@ -168,6 +169,7 @@ list (APPEND PUBLIC_HEADER_FILES
|
|||||||
opm/simulators/linalg/bda/BlockedMatrix.hpp
|
opm/simulators/linalg/bda/BlockedMatrix.hpp
|
||||||
opm/simulators/linalg/bda/cuda_header.hpp
|
opm/simulators/linalg/bda/cuda_header.hpp
|
||||||
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
|
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
|
||||||
|
opm/simulators/linalg/bda/ChowPatelIlu.hpp
|
||||||
opm/simulators/linalg/bda/Reorder.hpp
|
opm/simulators/linalg/bda/Reorder.hpp
|
||||||
opm/simulators/linalg/bda/ILUReorder.hpp
|
opm/simulators/linalg/bda/ILUReorder.hpp
|
||||||
opm/simulators/linalg/bda/opencl.hpp
|
opm/simulators/linalg/bda/opencl.hpp
|
||||||
|
|||||||
@@ -25,12 +25,21 @@
|
|||||||
|
|
||||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||||
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
|
||||||
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||||
|
|
||||||
|
|
||||||
namespace bda
|
namespace bda
|
||||||
{
|
{
|
||||||
|
|
||||||
|
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
|
||||||
|
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
|
||||||
|
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
|
||||||
|
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
|
||||||
|
// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
|
||||||
|
#define CHOW_PATEL 0
|
||||||
|
#define CHOW_PATEL_GPU 1
|
||||||
|
|
||||||
using Opm::OpmLog;
|
using Opm::OpmLog;
|
||||||
using Dune::Timer;
|
using Dune::Timer;
|
||||||
|
|
||||||
@@ -44,9 +53,11 @@ namespace bda
|
|||||||
{
|
{
|
||||||
delete[] invDiagVals;
|
delete[] invDiagVals;
|
||||||
delete[] diagIndex;
|
delete[] diagIndex;
|
||||||
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||||
delete[] toOrder;
|
delete[] toOrder;
|
||||||
delete[] fromOrder;
|
delete[] fromOrder;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
||||||
@@ -58,11 +69,18 @@ namespace bda
|
|||||||
this->nnz = mat->nnzbs * block_size * block_size;
|
this->nnz = mat->nnzbs * block_size * block_size;
|
||||||
this->nnzbs = mat->nnzbs;
|
this->nnzbs = mat->nnzbs;
|
||||||
|
|
||||||
|
int *CSCRowIndices = nullptr;
|
||||||
|
int *CSCColPointers = nullptr;
|
||||||
|
|
||||||
|
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||||
|
LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
|
||||||
|
} else {
|
||||||
toOrder = new int[Nb];
|
toOrder = new int[Nb];
|
||||||
fromOrder = new int[Nb];
|
fromOrder = new int[Nb];
|
||||||
|
CSCRowIndices = new int[nnzbs];
|
||||||
int *CSCRowIndices = new int[nnzbs];
|
CSCColPointers = new int[Nb + 1];
|
||||||
int *CSCColPointers = new int[Nb + 1];
|
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
|
||||||
|
LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
|
||||||
|
|
||||||
Timer t_convert;
|
Timer t_convert;
|
||||||
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
|
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
|
||||||
@@ -71,10 +89,9 @@ namespace bda
|
|||||||
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
|
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
|
||||||
OpmLog::info(out.str());
|
OpmLog::info(out.str());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Timer t_analysis;
|
Timer t_analysis;
|
||||||
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
|
|
||||||
LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
|
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||||
out << "BILU0 reordering strategy: " << "level_scheduling\n";
|
out << "BILU0 reordering strategy: " << "level_scheduling\n";
|
||||||
@@ -82,16 +99,29 @@ namespace bda
|
|||||||
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
||||||
out << "BILU0 reordering strategy: " << "graph_coloring\n";
|
out << "BILU0 reordering strategy: " << "graph_coloring\n";
|
||||||
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
|
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
|
||||||
|
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||||
|
out << "BILU0 reordering strategy: none\n";
|
||||||
|
// numColors = 1;
|
||||||
|
// rowsPerColor.emplace_back(Nb);
|
||||||
|
numColors = Nb;
|
||||||
|
for(int i = 0; i < Nb; ++i){
|
||||||
|
rowsPerColor.emplace_back(1);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
|
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
|
||||||
}
|
}
|
||||||
if(verbosity >= 3){
|
if(verbosity >= 3){
|
||||||
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
|
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
|
||||||
|
}
|
||||||
|
if(verbosity >= 2){
|
||||||
|
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
|
||||||
}
|
}
|
||||||
OpmLog::info(out.str());
|
OpmLog::info(out.str());
|
||||||
|
|
||||||
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||||
delete[] CSCRowIndices;
|
delete[] CSCRowIndices;
|
||||||
delete[] CSCColPointers;
|
delete[] CSCColPointers;
|
||||||
|
}
|
||||||
|
|
||||||
diagIndex = new int[mat->Nb];
|
diagIndex = new int[mat->Nb];
|
||||||
invDiagVals = new double[mat->Nb * bs * bs];
|
invDiagVals = new double[mat->Nb * bs * bs];
|
||||||
@@ -99,8 +129,6 @@ namespace bda
|
|||||||
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||||
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||||
|
|
||||||
LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
|
|
||||||
|
|
||||||
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
||||||
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs);
|
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs);
|
||||||
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
||||||
@@ -129,11 +157,307 @@ namespace bda
|
|||||||
return true;
|
return true;
|
||||||
} // end init()
|
} // end init()
|
||||||
|
|
||||||
|
// implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void BILU0<block_size>::chow_patel_decomposition()
|
||||||
|
{
|
||||||
|
const unsigned int bs = block_size;
|
||||||
|
int num_sweeps = 6;
|
||||||
|
|
||||||
|
// split matrix into L and U
|
||||||
|
// also convert U into BSC format (Ut)
|
||||||
|
// Ut stores diagonal for now
|
||||||
|
// original matrix LUmat is assumed to be symmetric
|
||||||
|
|
||||||
|
#ifndef NDEBUG
|
||||||
|
// verify that matrix is symmetric
|
||||||
|
for (int i = 0; i < Nb; ++i){
|
||||||
|
int iRowStart = LUmat->rowPointers[i];
|
||||||
|
int iRowEnd = LUmat->rowPointers[i + 1];
|
||||||
|
// for every block (i, j) in this row, check if (j, i) also exists
|
||||||
|
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
||||||
|
int j = LUmat->colIndices[ij];
|
||||||
|
int jRowStart = LUmat->rowPointers[j];
|
||||||
|
int jRowEnd = LUmat->rowPointers[j + 1];
|
||||||
|
bool blockFound = false;
|
||||||
|
// check all blocks on row j
|
||||||
|
// binary search is possible
|
||||||
|
for (int ji = jRowStart; ji < jRowEnd; ji++) {
|
||||||
|
int row = LUmat->colIndices[ji];
|
||||||
|
if (i == row) {
|
||||||
|
blockFound = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (false == blockFound) {
|
||||||
|
OPM_THROW(std::logic_error, "Error sparsity pattern must be symmetric when using chow_patel_decomposition()");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Ut is actually BSC format
|
||||||
|
std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Nb, (nnzbs + Nb) / 2);
|
||||||
|
|
||||||
|
Lmat->rowPointers[0] = 0;
|
||||||
|
for (int i = 0; i < Nb+1; i++) {
|
||||||
|
Ut->rowPointers[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
Opm::Detail::Inverter<bs> inverter;
|
||||||
|
|
||||||
|
// store inverted diagonal
|
||||||
|
for (int i = 0; i < Nb; i++) {
|
||||||
|
int iRowStart = LUmat->rowPointers[i];
|
||||||
|
int iRowEnd = LUmat->rowPointers[i + 1];
|
||||||
|
// for every block in this row
|
||||||
|
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
||||||
|
int j = LUmat->colIndices[ij];
|
||||||
|
if (i == j) {
|
||||||
|
inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize initial guess for L: L_A * D
|
||||||
|
// L_A is strictly lower triangular part of A
|
||||||
|
// D is inv(diag(A))
|
||||||
|
int num_blocks_L = 0;
|
||||||
|
for (int i = 0; i < Nb; i++) {
|
||||||
|
int iRowStart = LUmat->rowPointers[i];
|
||||||
|
int iRowEnd = LUmat->rowPointers[i + 1];
|
||||||
|
// for every block in this row
|
||||||
|
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
||||||
|
int j = LUmat->colIndices[ij];
|
||||||
|
if (i <= j) {
|
||||||
|
Ut->rowPointers[j+1]++; // actually colPointers, now simply indicates how many blocks this col holds
|
||||||
|
} else {
|
||||||
|
Lmat->colIndices[num_blocks_L] = j;
|
||||||
|
// multiply block of L with corresponding diag block
|
||||||
|
blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
|
||||||
|
num_blocks_L++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// TODO: copy all blocks for L at once, instead of copying each block individually
|
||||||
|
Lmat->rowPointers[i+1] = num_blocks_L;
|
||||||
|
}
|
||||||
|
|
||||||
|
// prefix sum to sum rowsizes into colpointers
|
||||||
|
std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers);
|
||||||
|
|
||||||
|
// initialize initial guess for U
|
||||||
|
for (int i = 0; i < Nb; i++) {
|
||||||
|
int iRowStart = LUmat->rowPointers[i];
|
||||||
|
int iRowEnd = LUmat->rowPointers[i + 1];
|
||||||
|
// for every block in this row
|
||||||
|
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
||||||
|
int j = LUmat->colIndices[ij];
|
||||||
|
if (i <= j){
|
||||||
|
int idx = Ut->rowPointers[j]++; // rowPointers[i] is (mis)used as the write offset of the current row i
|
||||||
|
Ut->colIndices[idx] = i; // actually rowIndices
|
||||||
|
memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// rotate
|
||||||
|
// the Ut->rowPointers were increased in the last loop
|
||||||
|
// now Ut->rowPointers[i+1] is at the same position as Ut->rowPointers[i] should have for a crs matrix. reset to correct expected value
|
||||||
|
for (int i = Nb; i > 0; --i) {
|
||||||
|
Ut->rowPointers[i] = Ut->rowPointers[i-1];
|
||||||
|
}
|
||||||
|
Ut->rowPointers[0] = 0;
|
||||||
|
|
||||||
|
|
||||||
|
// Utmp is needed for CPU and GPU decomposition, because U is transposed, and reversed after decomposition
|
||||||
|
// U will be reversed because it is used with backwards substitution, the last row is used first
|
||||||
|
// Ltmp is only needed for CPU decomposition, GPU creates GPU buffer for Ltmp
|
||||||
|
double *Utmp = new double[Ut->nnzbs * block_size * block_size];
|
||||||
|
|
||||||
|
// actual ILU decomposition
|
||||||
|
#if CHOW_PATEL_GPU
|
||||||
|
chowPatelIlu.decomposition(queue, context,
|
||||||
|
Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
|
||||||
|
Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs,
|
||||||
|
LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs,
|
||||||
|
Nb, num_sweeps, verbosity);
|
||||||
|
#else
|
||||||
|
double *Ltmp = new double[Lmat->nnzbs * block_size * block_size];
|
||||||
|
for (int sweep = 0; sweep < num_sweeps; ++sweep) {
|
||||||
|
|
||||||
|
// algorithm
|
||||||
|
// for every block in A (LUmat):
|
||||||
|
// if i > j:
|
||||||
|
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
|
||||||
|
// else:
|
||||||
|
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
|
||||||
|
|
||||||
|
// for every row
|
||||||
|
for (int row = 0; row < Nb; row++) {
|
||||||
|
// update U
|
||||||
|
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
|
||||||
|
int jColStart = Ut->rowPointers[row];
|
||||||
|
int jColEnd = Ut->rowPointers[row + 1];
|
||||||
|
int colU = row; // rename for clarity, next row in Ut means next col in U
|
||||||
|
// for every block in this row
|
||||||
|
for (int ij = jColStart; ij < jColEnd; ij++) {
|
||||||
|
int rowU1 = Ut->colIndices[ij]; // actually rowIndices for U
|
||||||
|
// refine Uij element (or diagonal)
|
||||||
|
int i1 = LUmat->rowPointers[rowU1];
|
||||||
|
int i2 = LUmat->rowPointers[rowU1+1];
|
||||||
|
|
||||||
|
// search on row rowU1, find blockIndex in LUmat of block with same col (colU) as Uij
|
||||||
|
// LUmat->nnzValues[kk] is block Aij
|
||||||
|
auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, colU);
|
||||||
|
assert(candidate != LUmat->colIndices + i2);
|
||||||
|
auto kk = candidate - LUmat->colIndices;
|
||||||
|
|
||||||
|
double aij[bs*bs];
|
||||||
|
// copy block to Aij so operations can be done on it without affecting LUmat
|
||||||
|
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
|
||||||
|
|
||||||
|
int jk = Lmat->rowPointers[rowU1]; // points to row rowU1 in L
|
||||||
|
// if row rowU1 is empty, skip row
|
||||||
|
if (jk < Lmat->rowPointers[rowU1+1]) {
|
||||||
|
int colL = Lmat->colIndices[jk];
|
||||||
|
// only check until block U(i,j) is reached
|
||||||
|
for (int k = jColStart; k < ij; ++k) {
|
||||||
|
int rowU2 = Ut->colIndices[k];
|
||||||
|
while (colL < rowU2) {
|
||||||
|
++jk; // check next block on row rowU1 of L
|
||||||
|
colL = Lmat->colIndices[jk];
|
||||||
|
}
|
||||||
|
if (colL == rowU2) {
|
||||||
|
// Aij -= (Lik * Ukj)
|
||||||
|
blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Uij_new = Aij - sum
|
||||||
|
memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs);
|
||||||
|
}
|
||||||
|
|
||||||
|
// update L
|
||||||
|
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
|
||||||
|
int iRowStart = Lmat->rowPointers[row];
|
||||||
|
int iRowEnd = Lmat->rowPointers[row + 1];
|
||||||
|
|
||||||
|
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
||||||
|
int j = Lmat->colIndices[ij];
|
||||||
|
// refine Lij element
|
||||||
|
// search on row 'row', find blockIndex in LUmat of block with same col (j) as Lij
|
||||||
|
// LUmat->nnzValues[kk] is block Aij
|
||||||
|
int i1 = LUmat->rowPointers[row];
|
||||||
|
int i2 = LUmat->rowPointers[row+1];
|
||||||
|
|
||||||
|
auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, j);
|
||||||
|
assert(candidate != LUmat->colIndices + i2);
|
||||||
|
auto kk = candidate - LUmat->colIndices;
|
||||||
|
|
||||||
|
double aij[bs*bs];
|
||||||
|
// copy block to Aij so operations can be done on it without affecting LUmat
|
||||||
|
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
|
||||||
|
|
||||||
|
int jk = Ut->rowPointers[j]; // actually colPointers, jk points to col j in U
|
||||||
|
int rowU = Ut->colIndices[jk]; // actually rowIndices, rowU is the row of block jk
|
||||||
|
// only check until block L(i,j) is reached
|
||||||
|
for (int k = iRowStart; k < ij; ++k) {
|
||||||
|
int colL = Lmat->colIndices[k];
|
||||||
|
while (rowU < colL) {
|
||||||
|
++jk; // check next block on col j of U
|
||||||
|
rowU = Ut->colIndices[jk];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rowU == colL) {
|
||||||
|
// Aij -= (Lik * Ukj)
|
||||||
|
blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculate (Aij - sum) / Ujj
|
||||||
|
double ujj[bs*bs];
|
||||||
|
inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
|
||||||
|
// Lij_new = (Aij - sum) / Ujj
|
||||||
|
blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 1st sweep writes to Ltmp
|
||||||
|
// 2nd sweep writes to Lmat->nnzValues
|
||||||
|
std::swap(Lmat->nnzValues, Ltmp);
|
||||||
|
std::swap(Ut->nnzValues, Utmp);
|
||||||
|
} // end sweep
|
||||||
|
|
||||||
|
// if number of sweeps is even, swap again so data is in Lmat->nnzValues
|
||||||
|
if (num_sweeps % 2 == 0) {
|
||||||
|
std::swap(Lmat->nnzValues, Ltmp);
|
||||||
|
std::swap(Ut->nnzValues, Utmp);
|
||||||
|
}
|
||||||
|
delete[] Ltmp;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// convert Ut to BSR
|
||||||
|
// diagonal stored separately
|
||||||
|
std::vector<int> ptr(Nb+1, 0);
|
||||||
|
std::vector<int> col(Ut->rowPointers[Nb]);
|
||||||
|
|
||||||
|
// count blocks per row for U (BSR)
|
||||||
|
// store diagonal in invDiagVals
|
||||||
|
for(int i = 0; i < Nb; ++i) {
|
||||||
|
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
|
||||||
|
int j = Ut->colIndices[k];
|
||||||
|
if (j == i) {
|
||||||
|
inverter(Ut->nnzValues + k * bs * bs, invDiagVals + i * bs * bs);
|
||||||
|
} else {
|
||||||
|
++ptr[j+1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// prefix sum
|
||||||
|
std::partial_sum(ptr.begin(), ptr.end(), ptr.begin());
|
||||||
|
|
||||||
|
// actually copy nonzero values for U
|
||||||
|
for(int i = 0; i < Nb; ++i) {
|
||||||
|
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
|
||||||
|
int j = Ut->colIndices[k];
|
||||||
|
if (j != i) {
|
||||||
|
int head = ptr[j]++;
|
||||||
|
col[head] = i;
|
||||||
|
memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// the ptr[] were increased in the last loop
|
||||||
|
std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
|
||||||
|
ptr.front() = 0;
|
||||||
|
|
||||||
|
// reversing the rows of U, because that is the order they are used in during ILU apply
|
||||||
|
int URowIndex = 0;
|
||||||
|
int offsetU = 0; // number of nnz blocks that are already copied to Umat
|
||||||
|
Umat->rowPointers[0] = 0;
|
||||||
|
for (int i = LUmat->Nb - 1; i >= 0; i--) {
|
||||||
|
int rowSize = ptr[i + 1] - ptr[i]; // number of blocks in this row
|
||||||
|
memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize);
|
||||||
|
memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize);
|
||||||
|
offsetU += rowSize;
|
||||||
|
Umat->rowPointers[URowIndex + 1] = offsetU;
|
||||||
|
URowIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
delete[] Utmp;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
|
bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
|
||||||
{
|
{
|
||||||
const unsigned int bs = block_size;
|
const unsigned int bs = block_size;
|
||||||
|
auto *m = mat;
|
||||||
|
|
||||||
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||||
|
m = rmat.get();
|
||||||
Timer t_reorder;
|
Timer t_reorder;
|
||||||
reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
|
reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
|
||||||
|
|
||||||
@@ -142,10 +466,12 @@ namespace bda
|
|||||||
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
|
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
|
||||||
OpmLog::info(out.str());
|
OpmLog::info(out.str());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
||||||
|
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
|
||||||
Timer t_copy;
|
Timer t_copy;
|
||||||
memcpy(LUmat->nnzValues, rmat->nnzValues, sizeof(double) * bs * bs * rmat->nnzbs);
|
memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
|
||||||
|
|
||||||
if (verbosity >= 3){
|
if (verbosity >= 3){
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
@@ -153,6 +479,10 @@ namespace bda
|
|||||||
OpmLog::info(out.str());
|
OpmLog::info(out.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Timer t_decomposition;
|
||||||
|
#if CHOW_PATEL
|
||||||
|
chow_patel_decomposition();
|
||||||
|
#else
|
||||||
int i, j, ij, ik, jk;
|
int i, j, ij, ik, jk;
|
||||||
int iRowStart, iRowEnd, jRowEnd;
|
int iRowStart, iRowEnd, jRowEnd;
|
||||||
double pivot[bs * bs];
|
double pivot[bs * bs];
|
||||||
@@ -160,8 +490,6 @@ namespace bda
|
|||||||
int LSize = 0;
|
int LSize = 0;
|
||||||
Opm::Detail::Inverter<bs> inverter; // reuse inverter to invert blocks
|
Opm::Detail::Inverter<bs> inverter; // reuse inverter to invert blocks
|
||||||
|
|
||||||
Timer t_decomposition;
|
|
||||||
|
|
||||||
// go through all rows
|
// go through all rows
|
||||||
for (i = 0; i < LUmat->Nb; i++) {
|
for (i = 0; i < LUmat->Nb; i++) {
|
||||||
iRowStart = LUmat->rowPointers[i];
|
iRowStart = LUmat->rowPointers[i];
|
||||||
@@ -239,6 +567,7 @@ namespace bda
|
|||||||
Umat->rowPointers[URowIndex + 1] = offsetU;
|
Umat->rowPointers[URowIndex + 1] = offsetU;
|
||||||
URowIndex++;
|
URowIndex++;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
if (verbosity >= 3) {
|
if (verbosity >= 3) {
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
|
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
|
||||||
@@ -278,6 +607,7 @@ namespace bda
|
|||||||
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||||
// event.wait();
|
// event.wait();
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int color = numColors-1; color >= 0; --color){
|
for(int color = numColors-1; color >= 0; --color){
|
||||||
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||||
// event.wait();
|
// event.wait();
|
||||||
@@ -320,6 +650,7 @@ namespace bda
|
|||||||
template BILU0<n>::BILU0(ILUReorder, int); \
|
template BILU0<n>::BILU0(ILUReorder, int); \
|
||||||
template BILU0<n>::~BILU0(); \
|
template BILU0<n>::~BILU0(); \
|
||||||
template bool BILU0<n>::init(BlockedMatrix<n>*); \
|
template bool BILU0<n>::init(BlockedMatrix<n>*); \
|
||||||
|
template void BILU0<n>::chow_patel_decomposition(); \
|
||||||
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
|
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
|
||||||
template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y); \
|
template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y); \
|
||||||
template void BILU0<n>::setOpenCLContext(cl::Context*); \
|
template void BILU0<n>::setOpenCLContext(cl::Context*); \
|
||||||
|
|||||||
@@ -24,6 +24,7 @@
|
|||||||
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
|
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
|
||||||
|
|
||||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
|
||||||
|
|
||||||
namespace bda
|
namespace bda
|
||||||
{
|
{
|
||||||
@@ -67,6 +68,10 @@ namespace bda
|
|||||||
int lmem_per_work_group = 0;
|
int lmem_per_work_group = 0;
|
||||||
bool pattern_uploaded = false;
|
bool pattern_uploaded = false;
|
||||||
|
|
||||||
|
ChowPatelIlu chowPatelIlu;
|
||||||
|
|
||||||
|
void chow_patel_decomposition();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
|
BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
|
||||||
|
|||||||
@@ -58,6 +58,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
|
|||||||
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
|
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
|
||||||
} else if (opencl_ilu_reorder == "graph_coloring") {
|
} else if (opencl_ilu_reorder == "graph_coloring") {
|
||||||
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
|
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
|
||||||
|
} else if (opencl_ilu_reorder == "none") {
|
||||||
|
ilu_reorder = bda::ILUReorder::NONE;
|
||||||
} else {
|
} else {
|
||||||
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
|
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -93,11 +93,23 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// subtract c from b and store in a
|
||||||
|
// a = b - c
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void blockSub(double *a, double *b, double *c)
|
||||||
|
{
|
||||||
|
for (unsigned int row = 0; row < block_size; row++) {
|
||||||
|
for (unsigned int col = 0; col < block_size; col++) {
|
||||||
|
a[block_size * row + col] = b[block_size * row + col] - c[block_size * row + col];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||||
template void sortBlockedRow<n>(int *, double *, int, int); \
|
template void sortBlockedRow<n>(int *, double *, int, int); \
|
||||||
template void blockMultSub<n>(double *, double *, double *); \
|
template void blockMultSub<n>(double *, double *, double *); \
|
||||||
template void blockMult<n>(double *, double *, double *); \
|
template void blockMult<n>(double *, double *, double *); \
|
||||||
|
template void blockSub<n>(double *, double *, double *); \
|
||||||
|
|
||||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||||
|
|||||||
@@ -109,6 +109,14 @@ void blockMultSub(double *a, double *b, double *c);
|
|||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
void blockMult(double *mat1, double *mat2, double *resMat);
|
void blockMult(double *mat1, double *mat2, double *resMat);
|
||||||
|
|
||||||
|
/// Subtract blocks
|
||||||
|
/// a = b - c
|
||||||
|
/// \param[out] a result block
|
||||||
|
/// \param[in] b input block
|
||||||
|
/// \param[in] c input block
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void blockSub(double *a, double *b, double *c);
|
||||||
|
|
||||||
} // end namespace bda
|
} // end namespace bda
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
631
opm/simulators/linalg/bda/ChowPatelIlu.cpp
Normal file
631
opm/simulators/linalg/bda/ChowPatelIlu.cpp
Normal file
@@ -0,0 +1,631 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Equinor ASA
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||||
|
#include <opm/common/ErrorMacros.hpp>
|
||||||
|
#include <dune/common/timer.hh>
|
||||||
|
|
||||||
|
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
|
||||||
|
|
||||||
|
namespace bda
|
||||||
|
{
|
||||||
|
|
||||||
|
using Opm::OpmLog;
|
||||||
|
|
||||||
|
// if PARALLEL is 0:
|
||||||
|
// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
|
||||||
|
// Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously,
|
||||||
|
// except when the number of blocks in that row exceeds the number of workitems per workgroup.
|
||||||
|
// In that case some workitems will process multiple blocks sequentially.
|
||||||
|
// else:
|
||||||
|
// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially
|
||||||
|
// Each block in a row gets a warp of 32 workitems, of which 9 are always active.
|
||||||
|
// Multiple blocks can be processed in parallel if a workgroup contains multiple warps.
|
||||||
|
// If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially.
|
||||||
|
|
||||||
|
// Notes:
|
||||||
|
// PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
|
||||||
|
// PARALLEL 1 should be run with at least 32 workitems per workgroup.
|
||||||
|
// The recommended number of workgroups for both options is Nb, which gives every row their own workgroup.
|
||||||
|
// PARALLEL 0 is generally faster, despite not having parallelization.
|
||||||
|
// only 3x3 blocks are supported
|
||||||
|
|
||||||
|
#define PARALLEL 0
|
||||||
|
|
||||||
|
#if PARALLEL
|
||||||
|
inline const char* chow_patel_ilu_sweep_s = R"(
|
||||||
|
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||||
|
|
||||||
|
// subtract blocks: a = a - b * c
|
||||||
|
// the output block has 9 entries, each entry is calculated by 1 thread
|
||||||
|
void blockMultSub(
|
||||||
|
__local double * restrict a,
|
||||||
|
__global const double * restrict b,
|
||||||
|
__global const double * restrict c)
|
||||||
|
{
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
const unsigned int warp_size = 32;
|
||||||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||||
|
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
|
||||||
|
if(thread_id_in_warp < block_size * block_size){
|
||||||
|
const unsigned int row = thread_id_in_warp / block_size;
|
||||||
|
const unsigned int col = thread_id_in_warp % block_size;
|
||||||
|
double temp = 0.0;
|
||||||
|
for (unsigned int k = 0; k < block_size; k++) {
|
||||||
|
temp += b[block_size * row + k] * c[block_size * k + col];
|
||||||
|
}
|
||||||
|
a[block_size * row + col] -= temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// multiply blocks: resMat = mat1 * mat2
|
||||||
|
// the output block has 9 entries, each entry is calculated by 1 thread
|
||||||
|
void blockMult(
|
||||||
|
__local const double * restrict mat1,
|
||||||
|
__local const double * restrict mat2,
|
||||||
|
__global double * restrict resMat)
|
||||||
|
{
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
const unsigned int warp_size = 32;
|
||||||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||||
|
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
|
||||||
|
if(thread_id_in_warp < block_size * block_size){
|
||||||
|
const unsigned int row = thread_id_in_warp / block_size;
|
||||||
|
const unsigned int col = thread_id_in_warp % block_size;
|
||||||
|
double temp = 0.0;
|
||||||
|
for (unsigned int k = 0; k < block_size; k++) {
|
||||||
|
temp += mat1[block_size * row + k] * mat2[block_size * k + col];
|
||||||
|
}
|
||||||
|
resMat[block_size * row + col] = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// invert block: inverse = matrix^{-1}
|
||||||
|
// the output block has 9 entries, each entry is calculated by 1 thread
|
||||||
|
void invert(
|
||||||
|
__global const double * restrict matrix,
|
||||||
|
__local double * restrict inverse)
|
||||||
|
{
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
const unsigned int bs = block_size; // rename to shorter name
|
||||||
|
const unsigned int warp_size = 32;
|
||||||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||||
|
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
|
||||||
|
if(thread_id_in_warp < block_size * block_size){
|
||||||
|
// code generated by maple, copied from Dune::DenseMatrix
|
||||||
|
double t4 = matrix[0] * matrix[4];
|
||||||
|
double t6 = matrix[0] * matrix[5];
|
||||||
|
double t8 = matrix[1] * matrix[3];
|
||||||
|
double t10 = matrix[2] * matrix[3];
|
||||||
|
double t12 = matrix[1] * matrix[6];
|
||||||
|
double t14 = matrix[2] * matrix[6];
|
||||||
|
|
||||||
|
double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
|
||||||
|
t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
|
||||||
|
double t17 = 1.0 / det;
|
||||||
|
|
||||||
|
const unsigned int r = thread_id_in_warp / block_size;
|
||||||
|
const unsigned int c = thread_id_in_warp % block_size;
|
||||||
|
const unsigned int r1 = (r+1) % bs;
|
||||||
|
const unsigned int c1 = (c+1) % bs;
|
||||||
|
const unsigned int r2 = (r+bs-1) % bs;
|
||||||
|
const unsigned int c2 = (c+bs-1) % bs;
|
||||||
|
inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// perform the fixed-point iteration
|
||||||
|
// all entries in L and U are updated once
|
||||||
|
// output is written to [LU]tmp
|
||||||
|
// aij and ujj are local arrays whose size is specified before kernel launch
|
||||||
|
__kernel void chow_patel_ilu_sweep(
|
||||||
|
__global const double * restrict Ut_vals,
|
||||||
|
__global const double * restrict L_vals,
|
||||||
|
__global const double * restrict LU_vals,
|
||||||
|
__global const int * restrict Ut_rows,
|
||||||
|
__global const int * restrict L_rows,
|
||||||
|
__global const int * restrict LU_rows,
|
||||||
|
__global const int * restrict Ut_cols,
|
||||||
|
__global const int * restrict L_cols,
|
||||||
|
__global const int * restrict LU_cols,
|
||||||
|
__global double * restrict Ltmp,
|
||||||
|
__global double * restrict Utmp,
|
||||||
|
const int Nb,
|
||||||
|
__local double *aij,
|
||||||
|
__local double *ujj)
|
||||||
|
{
|
||||||
|
const int bs = 3;
|
||||||
|
const unsigned int warp_size = 32;
|
||||||
|
const unsigned int work_group_size = get_local_size(0);
|
||||||
|
const unsigned int idx_b = get_global_id(0) / work_group_size;
|
||||||
|
const unsigned int num_groups = get_num_groups(0);
|
||||||
|
const unsigned int warps_per_group = work_group_size / warp_size;
|
||||||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||||
|
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
|
||||||
|
const unsigned int warp_id_in_group = idx_t / warp_size;
|
||||||
|
const unsigned int lmem_offset = warp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it
|
||||||
|
// every workitem in a warp has the same lmem_offset
|
||||||
|
|
||||||
|
// for every row of L or every col of U
|
||||||
|
for (int row = idx_b; row < Nb; row+=num_groups) {
|
||||||
|
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
|
||||||
|
int jColStart = Ut_rows[row]; // actually colPointers to U
|
||||||
|
int jColEnd = Ut_rows[row + 1];
|
||||||
|
// for every block on this column
|
||||||
|
for (int ij = jColStart + warp_id_in_group; ij < jColEnd; ij+=warps_per_group) {
|
||||||
|
int rowU1 = Ut_cols[ij]; // actually rowIndices for U
|
||||||
|
// refine Uij element (or diagonal)
|
||||||
|
int i1 = LU_rows[rowU1];
|
||||||
|
int i2 = LU_rows[rowU1+1];
|
||||||
|
int kk = 0;
|
||||||
|
// LUmat->nnzValues[kk] is block Aij
|
||||||
|
for(kk = i1; kk < i2; ++kk) {
|
||||||
|
int c = LU_cols[kk];
|
||||||
|
if (c >= row) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy block Aij so operations can be done on it without affecting LUmat
|
||||||
|
if(thread_id_in_warp < bs*bs){
|
||||||
|
aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp];
|
||||||
|
}
|
||||||
|
|
||||||
|
int jk = L_rows[rowU1]; // points to row rowU1 in L
|
||||||
|
// if row rowU1 is empty: skip row. The whole warp looks at the same row, so no divergence
|
||||||
|
if (jk < L_rows[rowU1+1]) {
|
||||||
|
int colL = L_cols[jk];
|
||||||
|
// only check until block U(i,j) is reached
|
||||||
|
for (int k = jColStart; k < ij; ++k) {
|
||||||
|
int rowU2 = Ut_cols[k]; // actually rowIndices for U
|
||||||
|
while (colL < rowU2) {
|
||||||
|
++jk; // check next block on row rowU1 of L
|
||||||
|
colL = L_cols[jk];
|
||||||
|
}
|
||||||
|
if (colL == rowU2) {
|
||||||
|
// Aij -= (Lik * Ukj)
|
||||||
|
blockMultSub(aij+lmem_offset, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Uij_new = Aij - sum
|
||||||
|
// write result of this sweep
|
||||||
|
if(thread_id_in_warp < bs*bs){
|
||||||
|
Utmp[ij*bs*bs + thread_id_in_warp] = aij[lmem_offset + thread_id_in_warp];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// update L
|
||||||
|
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
|
||||||
|
int iRowStart = L_rows[row];
|
||||||
|
int iRowEnd = L_rows[row + 1];
|
||||||
|
|
||||||
|
for (int ij = iRowStart + warp_id_in_group; ij < iRowEnd; ij+=warps_per_group) {
|
||||||
|
int j = L_cols[ij];
|
||||||
|
// // refine Lij element
|
||||||
|
int i1 = LU_rows[row];
|
||||||
|
int i2 = LU_rows[row+1];
|
||||||
|
int kk = 0;
|
||||||
|
// LUmat->nnzValues[kk] is block Aij
|
||||||
|
for(kk = i1; kk < i2; ++kk) {
|
||||||
|
int c = LU_cols[kk];
|
||||||
|
if (c >= j) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy block Aij so operations can be done on it without affecting LUmat
|
||||||
|
if(thread_id_in_warp < bs*bs){
|
||||||
|
aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp];
|
||||||
|
}
|
||||||
|
|
||||||
|
int jk = Ut_rows[j]; // actually colPointers, jk points to col j in U
|
||||||
|
int rowU = Ut_cols[jk]; // actually rowIndices, rowU is the row of block jk
|
||||||
|
// only check until block L(i,j) is reached
|
||||||
|
for (int k = iRowStart; k < ij; ++k) {
|
||||||
|
int colL = L_cols[k];
|
||||||
|
while(rowU < colL) {
|
||||||
|
++jk; // check next block on col j of U
|
||||||
|
rowU = Ut_cols[jk];
|
||||||
|
}
|
||||||
|
|
||||||
|
if(rowU == colL) {
|
||||||
|
// Aij -= (Lik * Ukj)
|
||||||
|
blockMultSub(aij+lmem_offset, L_vals + k * bs * bs , Ut_vals + jk * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculate 1 / Ujj
|
||||||
|
invert(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+lmem_offset);
|
||||||
|
|
||||||
|
// Lij_new = (Aij - sum) / Ujj
|
||||||
|
// write result of this sweep
|
||||||
|
blockMult(aij+lmem_offset, ujj+lmem_offset, Ltmp + ij * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)";
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
inline const char* chow_patel_ilu_sweep_s = R"(
|
||||||
|
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||||
|
|
||||||
|
// subtract blocks: a = a - b * c
|
||||||
|
// only one workitem performs this action
|
||||||
|
void blockMultSub(
|
||||||
|
__local double * restrict a,
|
||||||
|
__global const double * restrict b,
|
||||||
|
__global const double * restrict c)
|
||||||
|
{
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
for (unsigned int row = 0; row < block_size; row++) {
|
||||||
|
for (unsigned int col = 0; col < block_size; col++) {
|
||||||
|
double temp = 0.0;
|
||||||
|
for (unsigned int k = 0; k < block_size; k++) {
|
||||||
|
temp += b[block_size * row + k] * c[block_size * k + col];
|
||||||
|
}
|
||||||
|
a[block_size * row + col] -= temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// multiply blocks: resMat = mat1 * mat2
|
||||||
|
// only one workitem performs this action
|
||||||
|
void blockMult(
|
||||||
|
__local const double * restrict mat1,
|
||||||
|
__local const double * restrict mat2,
|
||||||
|
__global double * restrict resMat)
|
||||||
|
{
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
for (unsigned int row = 0; row < block_size; row++) {
|
||||||
|
for (unsigned int col = 0; col < block_size; col++) {
|
||||||
|
double temp = 0.0;
|
||||||
|
for (unsigned int k = 0; k < block_size; k++) {
|
||||||
|
temp += mat1[block_size * row + k] * mat2[block_size * k + col];
|
||||||
|
}
|
||||||
|
resMat[block_size * row + col] = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// invert block: inverse = matrix^{-1}
|
||||||
|
// only one workitem performs this action
|
||||||
|
__kernel void inverter(
|
||||||
|
__global const double * restrict matrix,
|
||||||
|
__local double * restrict inverse)
|
||||||
|
{
|
||||||
|
// code generated by maple, copied from Dune::DenseMatrix
|
||||||
|
double t4 = matrix[0] * matrix[4];
|
||||||
|
double t6 = matrix[0] * matrix[5];
|
||||||
|
double t8 = matrix[1] * matrix[3];
|
||||||
|
double t10 = matrix[2] * matrix[3];
|
||||||
|
double t12 = matrix[1] * matrix[6];
|
||||||
|
double t14 = matrix[2] * matrix[6];
|
||||||
|
|
||||||
|
double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
|
||||||
|
t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
|
||||||
|
double t17 = 1.0 / det;
|
||||||
|
|
||||||
|
inverse[0] = (matrix[4] * matrix[8] - matrix[5] * matrix[7]) * t17;
|
||||||
|
inverse[1] = -(matrix[1] * matrix[8] - matrix[2] * matrix[7]) * t17;
|
||||||
|
inverse[2] = (matrix[1] * matrix[5] - matrix[2] * matrix[4]) * t17;
|
||||||
|
inverse[3] = -(matrix[3] * matrix[8] - matrix[5] * matrix[6]) * t17;
|
||||||
|
inverse[4] = (matrix[0] * matrix[8] - t14) * t17;
|
||||||
|
inverse[5] = -(t6 - t10) * t17;
|
||||||
|
inverse[6] = (matrix[3] * matrix[7] - matrix[4] * matrix[6]) * t17;
|
||||||
|
inverse[7] = -(matrix[0] * matrix[7] - t12) * t17;
|
||||||
|
inverse[8] = (t4 - t8) * t17;
|
||||||
|
}
|
||||||
|
|
||||||
|
// perform the fixed-point iteration
|
||||||
|
// all entries in L and U are updated once
|
||||||
|
// output is written to [LU]tmp
|
||||||
|
// aij and ujj are local arrays whose size is specified before kernel launch
|
||||||
|
__kernel void chow_patel_ilu_sweep(
|
||||||
|
__global const double * restrict Ut_vals,
|
||||||
|
__global const double * restrict L_vals,
|
||||||
|
__global const double * restrict LU_vals,
|
||||||
|
__global const int * restrict Ut_rows,
|
||||||
|
__global const int * restrict L_rows,
|
||||||
|
__global const int * restrict LU_rows,
|
||||||
|
__global const int * restrict Ut_cols,
|
||||||
|
__global const int * restrict L_cols,
|
||||||
|
__global const int * restrict LU_cols,
|
||||||
|
__global double * restrict Ltmp,
|
||||||
|
__global double * restrict Utmp,
|
||||||
|
const int Nb,
|
||||||
|
__local double *aij,
|
||||||
|
__local double *ujj)
|
||||||
|
{
|
||||||
|
const int bs = 3;
|
||||||
|
|
||||||
|
const unsigned int warp_size = 32;
|
||||||
|
const unsigned int work_group_size = get_local_size(0);
|
||||||
|
const unsigned int idx_b = get_global_id(0) / work_group_size;
|
||||||
|
const unsigned int num_groups = get_num_groups(0);
|
||||||
|
const unsigned int warps_per_group = work_group_size / warp_size;
|
||||||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||||
|
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
|
||||||
|
const unsigned int warp_id_in_group = idx_t / warp_size;
|
||||||
|
|
||||||
|
// for every row of L or every col of U
|
||||||
|
for (int row = idx_b; row < Nb; row+=num_groups) {
|
||||||
|
// Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
|
||||||
|
int jColStart = Ut_rows[row]; // actually colPointers to U
|
||||||
|
int jColEnd = Ut_rows[row + 1];
|
||||||
|
// for every block on this column
|
||||||
|
for (int ij = jColStart + idx_t; ij < jColEnd; ij+=work_group_size) {
|
||||||
|
int rowU1 = Ut_cols[ij]; // actually rowIndices for U
|
||||||
|
// refine Uij element (or diagonal)
|
||||||
|
int i1 = LU_rows[rowU1];
|
||||||
|
int i2 = LU_rows[rowU1+1];
|
||||||
|
int kk = 0;
|
||||||
|
// LUmat->nnzValues[kk] is block Aij
|
||||||
|
for(kk = i1; kk < i2; ++kk) {
|
||||||
|
int c = LU_cols[kk];
|
||||||
|
if (c >= row) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy block Aij so operations can be done on it without affecting LUmat
|
||||||
|
for(int z = 0; z < bs*bs; ++z){
|
||||||
|
aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z];
|
||||||
|
}
|
||||||
|
|
||||||
|
int jk = L_rows[rowU1];
|
||||||
|
// if row rowU1 is empty: do not sum. The workitems have different rowU1 values, divergence is possible
|
||||||
|
int colL = (jk < L_rows[rowU1+1]) ? L_cols[jk] : Nb;
|
||||||
|
|
||||||
|
// only check until block U(i,j) is reached
|
||||||
|
for (int k = jColStart; k < ij; ++k) {
|
||||||
|
int rowU2 = Ut_cols[k]; // actually rowIndices for U
|
||||||
|
while (colL < rowU2) {
|
||||||
|
++jk; // check next block on row rowU1 of L
|
||||||
|
colL = L_cols[jk];
|
||||||
|
}
|
||||||
|
if (colL == rowU2) {
|
||||||
|
// Aij -= (Lik * Ukj)
|
||||||
|
blockMultSub(aij+idx_t*bs*bs, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Uij_new = Aij - sum
|
||||||
|
// write result of this sweep
|
||||||
|
for(int z = 0; z < bs*bs; ++z){
|
||||||
|
Utmp[ij*bs*bs + z] = aij[idx_t*bs*bs+z];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// update L
|
||||||
|
// Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
|
||||||
|
int iRowStart = L_rows[row];
|
||||||
|
int iRowEnd = L_rows[row + 1];
|
||||||
|
|
||||||
|
for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=work_group_size) {
|
||||||
|
int j = L_cols[ij];
|
||||||
|
// // refine Lij element
|
||||||
|
int i1 = LU_rows[row];
|
||||||
|
int i2 = LU_rows[row+1];
|
||||||
|
int kk = 0;
|
||||||
|
// LUmat->nnzValues[kk] is block Aij
|
||||||
|
for(kk = i1; kk < i2; ++kk) {
|
||||||
|
int c = LU_cols[kk];
|
||||||
|
if (c >= j) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy block Aij so operations can be done on it without affecting LUmat
|
||||||
|
for(int z = 0; z < bs*bs; ++z){
|
||||||
|
aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z];
|
||||||
|
}
|
||||||
|
|
||||||
|
int jk = Ut_rows[j]; // actually colPointers, jk points to col j in U
|
||||||
|
int rowU = Ut_cols[jk]; // actually rowIndices, rowU is the row of block jk
|
||||||
|
// only check until block L(i,j) is reached
|
||||||
|
for (int k = iRowStart; k < ij; ++k) {
|
||||||
|
int colL = L_cols[k];
|
||||||
|
while(rowU < colL) {
|
||||||
|
++jk; // check next block on col j of U
|
||||||
|
rowU = Ut_cols[jk];
|
||||||
|
}
|
||||||
|
|
||||||
|
if(rowU == colL) {
|
||||||
|
// Aij -= (Lik * Ukj)
|
||||||
|
blockMultSub(aij+idx_t*bs*bs, L_vals + k * bs * bs , Ut_vals + jk * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// calculate 1 / ujj
|
||||||
|
inverter(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+idx_t*bs*bs);
|
||||||
|
|
||||||
|
// Lij_new = (Aij - sum) / Ujj
|
||||||
|
// write result of this sweep
|
||||||
|
blockMult(aij+idx_t*bs*bs, ujj+idx_t*bs*bs, Ltmp + ij * bs * bs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)";
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void ChowPatelIlu::decomposition(
|
||||||
|
cl::CommandQueue *queue, cl::Context *context,
|
||||||
|
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
|
||||||
|
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
|
||||||
|
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
|
||||||
|
int Nb, int num_sweeps, int verbosity)
|
||||||
|
{
|
||||||
|
const int block_size = 3;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// just put everything in the capture list
|
||||||
|
std::call_once(initialize_flag, [&](){
|
||||||
|
cl::Program::Sources source(1, std::make_pair(chow_patel_ilu_sweep_s, strlen(chow_patel_ilu_sweep_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
|
||||||
|
cl::Program program = cl::Program(*context, source, &err);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL could not create Program");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
|
||||||
|
program.build(devices);
|
||||||
|
|
||||||
|
chow_patel_ilu_sweep_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
cl::Buffer&, cl::Buffer&,
|
||||||
|
const int, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "chow_patel_ilu_sweep", &err)));
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL could not create Kernel");
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate GPU memory
|
||||||
|
d_Ut_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
|
||||||
|
d_L_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size);
|
||||||
|
d_LU_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * LU_nnzbs * block_size * block_size);
|
||||||
|
d_Ut_ptrs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
|
||||||
|
d_L_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
|
||||||
|
d_LU_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
|
||||||
|
d_Ut_idxs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Ut_nnzbs);
|
||||||
|
d_L_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * L_nnzbs);
|
||||||
|
d_LU_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LU_nnzbs);
|
||||||
|
d_Ltmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size);
|
||||||
|
d_Utmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
|
||||||
|
|
||||||
|
Dune::Timer t_copy_pattern;
|
||||||
|
events.resize(6);
|
||||||
|
err |= queue->enqueueWriteBuffer(d_Ut_ptrs, CL_FALSE, 0, sizeof(int) * (Nb+1), Ut_ptrs, nullptr, &events[0]);
|
||||||
|
err |= queue->enqueueWriteBuffer(d_L_rows, CL_FALSE, 0, sizeof(int) * (Nb+1), L_rows, nullptr, &events[1]);
|
||||||
|
err |= queue->enqueueWriteBuffer(d_LU_rows, CL_FALSE, 0, sizeof(int) * (Nb+1), LU_rows, nullptr, &events[2]);
|
||||||
|
err |= queue->enqueueWriteBuffer(d_Ut_idxs, CL_FALSE, 0, sizeof(int) * Ut_nnzbs, Ut_idxs, nullptr, &events[3]);
|
||||||
|
err |= queue->enqueueWriteBuffer(d_L_cols, CL_FALSE, 0, sizeof(int) * L_nnzbs, L_cols, nullptr, &events[4]);
|
||||||
|
err |= queue->enqueueWriteBuffer(d_LU_cols, CL_FALSE, 0, sizeof(int) * LU_nnzbs, LU_cols, nullptr, &events[5]);
|
||||||
|
cl::WaitForEvents(events);
|
||||||
|
events.clear();
|
||||||
|
if (verbosity >= 3){
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "ChowPatelIlu copy sparsity pattern time: " << t_copy_pattern.stop() << " s";
|
||||||
|
OpmLog::info(out.str());
|
||||||
|
}
|
||||||
|
if (verbosity >= 2){
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "ChowPatelIlu PARALLEL: " << PARALLEL;
|
||||||
|
OpmLog::info(out.str());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
// copy to GPU
|
||||||
|
Dune::Timer t_copy1;
|
||||||
|
events.resize(3);
|
||||||
|
err = queue->enqueueWriteBuffer(d_Ut_vals, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]);
|
||||||
|
err |= queue->enqueueWriteBuffer(d_L_vals, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]);
|
||||||
|
err |= queue->enqueueWriteBuffer(d_LU_vals, CL_FALSE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals, nullptr, &events[2]);
|
||||||
|
cl::WaitForEvents(events);
|
||||||
|
events.clear();
|
||||||
|
if (verbosity >= 3){
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "ChowPatelIlu copy1 time: " << t_copy1.stop() << " s";
|
||||||
|
OpmLog::info(out.str());
|
||||||
|
}
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||||
|
OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL enqueueWriteBuffer error");
|
||||||
|
}
|
||||||
|
|
||||||
|
// call kernel
|
||||||
|
for (int sweep = 0; sweep < num_sweeps; ++sweep) {
|
||||||
|
// normally, L_vals and Ltmp are swapped after the sweep is done
|
||||||
|
// these conditionals implement that without actually swapping pointers
|
||||||
|
// 1st sweep reads X_vals, writes to Xtmp
|
||||||
|
// 2nd sweep reads Xtmp, writes to X_vals
|
||||||
|
auto *Larg1 = (sweep % 2 == 0) ? &d_L_vals : &d_Ltmp;
|
||||||
|
auto *Larg2 = (sweep % 2 == 0) ? &d_Ltmp : &d_L_vals;
|
||||||
|
auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp;
|
||||||
|
auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals;
|
||||||
|
int num_work_groups = Nb;
|
||||||
|
#if PARALLEL
|
||||||
|
int work_group_size = 32;
|
||||||
|
#else
|
||||||
|
int work_group_size = 16;
|
||||||
|
#endif
|
||||||
|
int total_work_items = num_work_groups * work_group_size;
|
||||||
|
int lmem_per_work_group = work_group_size * block_size * block_size * sizeof(double);
|
||||||
|
Dune::Timer t_kernel;
|
||||||
|
event = (*chow_patel_ilu_sweep_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||||
|
*Uarg1, *Larg1, d_LU_vals,
|
||||||
|
d_Ut_ptrs, d_L_rows, d_LU_rows,
|
||||||
|
d_Ut_idxs, d_L_cols, d_LU_cols,
|
||||||
|
*Larg2, *Uarg2, Nb, cl::Local(lmem_per_work_group), cl::Local(lmem_per_work_group));
|
||||||
|
event.wait();
|
||||||
|
if (verbosity >= 3){
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "ChowPatelIlu sweep kernel time: " << t_kernel.stop() << " s";
|
||||||
|
OpmLog::info(out.str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy back
|
||||||
|
Dune::Timer t_copy2;
|
||||||
|
events.resize(2);
|
||||||
|
if (num_sweeps % 2 == 0) {
|
||||||
|
err = queue->enqueueReadBuffer(d_Ut_vals, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]);
|
||||||
|
err |= queue->enqueueReadBuffer(d_L_vals, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]);
|
||||||
|
} else {
|
||||||
|
err = queue->enqueueReadBuffer(d_Utmp, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]);
|
||||||
|
err |= queue->enqueueReadBuffer(d_Ltmp, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]);
|
||||||
|
}
|
||||||
|
cl::WaitForEvents(events);
|
||||||
|
events.clear();
|
||||||
|
if (verbosity >= 3){
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "ChowPatelIlu copy2 time: " << t_copy2.stop() << " s";
|
||||||
|
OpmLog::info(out.str());
|
||||||
|
}
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
// enqueueReadBuffer is C and does not throw exceptions like C++ OpenCL
|
||||||
|
OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL enqueueReadBuffer error");
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (const cl::Error& error) {
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
|
||||||
|
oss << getErrorString(error.err()) << std::endl;
|
||||||
|
// rethrow exception
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
} catch (const std::logic_error& error) {
|
||||||
|
// rethrow exception by OPM_THROW in the try{}
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
} // end namespace bda
|
||||||
|
|
||||||
87
opm/simulators/linalg/bda/ChowPatelIlu.hpp
Normal file
87
opm/simulators/linalg/bda/ChowPatelIlu.hpp
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Equinor ASA
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CHOW_PATEL_ILU_HEADER_INCLUDED
|
||||||
|
#define CHOW_PATEL_ILU_HEADER_INCLUDED
|
||||||
|
|
||||||
|
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
|
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||||
|
|
||||||
|
|
||||||
|
namespace bda
|
||||||
|
{
|
||||||
|
|
||||||
|
// This class implements a blocked version on GPU of the Fine-Grained Parallel ILU (FGPILU) by Chow and Patel 2015:
|
||||||
|
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
|
||||||
|
// only blocksize == 3 is supported
|
||||||
|
// decomposition() allocates the cl::Buffers on the first call, these are C++ objects that deallocate automatically
|
||||||
|
class ChowPatelIlu
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
cl::Buffer d_Ut_vals, d_L_vals, d_LU_vals;
|
||||||
|
cl::Buffer d_Ut_ptrs, d_Ut_idxs;
|
||||||
|
cl::Buffer d_L_rows, d_L_cols;
|
||||||
|
cl::Buffer d_LU_rows, d_LU_cols;
|
||||||
|
cl::Buffer d_Ltmp, d_Utmp;
|
||||||
|
|
||||||
|
cl::Event event;
|
||||||
|
std::vector<cl::Event> events;
|
||||||
|
cl_int err;
|
||||||
|
std::once_flag initialize_flag;
|
||||||
|
|
||||||
|
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
cl::Buffer&, cl::Buffer&,
|
||||||
|
const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > chow_patel_ilu_sweep_k;
|
||||||
|
|
||||||
|
public:
|
||||||
|
/// Executes the ChowPatelIlu sweeps
|
||||||
|
/// also copies data from CPU to GPU and GPU to CPU
|
||||||
|
/// \param[in] queue OpenCL commandqueue
|
||||||
|
/// \param[in] context OpenCL context
|
||||||
|
/// \param[in] Ut_ptrs BSC columnpointers
|
||||||
|
/// \param[in] Ut_idxs BSC rowindices
|
||||||
|
/// \param[inout] Ut_vals actual nonzeros for U
|
||||||
|
/// \param[in] Ut_nnzbs number of blocks in U
|
||||||
|
/// \param[in] L_rows BSR rowpointers
|
||||||
|
/// \param[in] L_cols BSR columnindices
|
||||||
|
/// \param[inout] L_vals actual nonzeroes for L
|
||||||
|
/// \param[in] L_nnzbs number of blocks in L
|
||||||
|
/// \param[in] LU_rows BSR rowpointers
|
||||||
|
/// \param[in] LU_cols BSR columnindices
|
||||||
|
/// \param[in] LU_vals actual nonzeroes for LU (original matrix)
|
||||||
|
/// \param[in] LU_nnzbs number of blocks in LU
|
||||||
|
/// \param[in] Nb number of blockrows
|
||||||
|
/// \param[in] num_sweeps number of sweeps to be done
|
||||||
|
/// \param[in] verbosity print verbosity
|
||||||
|
void decomposition(
|
||||||
|
cl::CommandQueue *queue, cl::Context *context,
|
||||||
|
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
|
||||||
|
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
|
||||||
|
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
|
||||||
|
int Nb, int num_sweeps, int verbosity);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
} // end namespace bda
|
||||||
|
|
||||||
|
#endif // CHOW_PATEL_ILU_HEADER_INCLUDED
|
||||||
@@ -27,7 +27,8 @@ namespace bda
|
|||||||
|
|
||||||
enum class ILUReorder {
|
enum class ILUReorder {
|
||||||
LEVEL_SCHEDULING,
|
LEVEL_SCHEDULING,
|
||||||
GRAPH_COLORING
|
GRAPH_COLORING,
|
||||||
|
NONE
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -126,6 +126,7 @@ public:
|
|||||||
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
|
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
|
||||||
/// Those indices need to be mapped via toOrder
|
/// Those indices need to be mapped via toOrder
|
||||||
/// \param[in] toOrder array with mappings
|
/// \param[in] toOrder array with mappings
|
||||||
|
/// \param[in] reorder whether reordering is actually used or not
|
||||||
void setReordering(int *toOrder, bool reorder);
|
void setReordering(int *toOrder, bool reorder);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -56,11 +56,6 @@ WellContributions::~WellContributions()
|
|||||||
|
|
||||||
if(num_std_wells > 0){
|
if(num_std_wells > 0){
|
||||||
delete[] val_pointers;
|
delete[] val_pointers;
|
||||||
#if HAVE_OPENCL
|
|
||||||
if(opencl_gpu){
|
|
||||||
delete[] h_toOrder;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if HAVE_OPENCL
|
#if HAVE_OPENCL
|
||||||
@@ -79,8 +74,15 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu
|
|||||||
this->queue = queue_;
|
this->queue = queue_;
|
||||||
}
|
}
|
||||||
|
|
||||||
void WellContributions::setKernel(kernel_type *kernel_){
|
void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){
|
||||||
this->kernel = kernel_;
|
this->kernel = kernel_;
|
||||||
|
this->kernel_no_reorder = kernel_no_reorder_;
|
||||||
|
}
|
||||||
|
|
||||||
|
void WellContributions::setReordering(int *h_toOrder_, bool reorder_)
|
||||||
|
{
|
||||||
|
this->h_toOrder = h_toOrder_;
|
||||||
|
this->reorder = reorder_;
|
||||||
}
|
}
|
||||||
|
|
||||||
void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
|
void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
|
||||||
@@ -90,29 +92,24 @@ void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffe
|
|||||||
const unsigned int lmem2 = sizeof(double) * dim_wells;
|
const unsigned int lmem2 = sizeof(double) * dim_wells;
|
||||||
|
|
||||||
cl::Event event;
|
cl::Event event;
|
||||||
|
if (reorder) {
|
||||||
event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||||
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
|
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
|
||||||
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
|
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
|
||||||
|
} else {
|
||||||
|
event = (*kernel_no_reorder)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||||
|
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, dim, dim_wells, *d_val_pointers_ocl,
|
||||||
|
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
|
||||||
|
}
|
||||||
|
event.wait();
|
||||||
}
|
}
|
||||||
|
|
||||||
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
|
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
|
||||||
if(h_x == nullptr){
|
if(h_x == nullptr){
|
||||||
h_x = new double[N];
|
h_x = new double[N];
|
||||||
h_y = new double[N];
|
h_y = new double[N];
|
||||||
}
|
}
|
||||||
|
|
||||||
if(h_toOrder == nullptr){
|
|
||||||
h_toOrder = new int[Nb];
|
|
||||||
}
|
|
||||||
|
|
||||||
if(!read_toOrder){
|
|
||||||
events.resize(1);
|
|
||||||
queue->enqueueReadBuffer(d_toOrder, CL_FALSE, 0, sizeof(int) * Nb, h_toOrder, nullptr, &events[0]);
|
|
||||||
events[0].wait();
|
|
||||||
events.clear();
|
|
||||||
read_toOrder = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
events.resize(2);
|
events.resize(2);
|
||||||
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
|
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
|
||||||
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
|
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
|
||||||
@@ -121,7 +118,7 @@ void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer
|
|||||||
|
|
||||||
// actually apply MultisegmentWells
|
// actually apply MultisegmentWells
|
||||||
for(Opm::MultisegmentWellContribution *well: multisegments){
|
for(Opm::MultisegmentWellContribution *well: multisegments){
|
||||||
well->setReordering(h_toOrder, true);
|
well->setReordering(h_toOrder, reorder);
|
||||||
well->apply(h_x, h_y);
|
well->apply(h_x, h_y);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -138,7 +135,7 @@ void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrd
|
|||||||
}
|
}
|
||||||
|
|
||||||
if(num_ms_wells > 0){
|
if(num_ms_wells > 0){
|
||||||
apply_mswells(d_x, d_y, d_toOrder);
|
apply_mswells(d_x, d_y);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -95,17 +95,21 @@ private:
|
|||||||
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
|
cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
|
||||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
|
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
|
||||||
|
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
const unsigned int, const unsigned int, cl::Buffer&,
|
||||||
|
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type_no_reorder;
|
||||||
|
|
||||||
cl::Context *context;
|
cl::Context *context;
|
||||||
cl::CommandQueue *queue;
|
cl::CommandQueue *queue;
|
||||||
kernel_type *kernel;
|
kernel_type *kernel;
|
||||||
|
kernel_type_no_reorder *kernel_no_reorder;
|
||||||
std::vector<cl::Event> events;
|
std::vector<cl::Event> events;
|
||||||
|
|
||||||
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
|
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
|
||||||
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
|
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
|
||||||
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
|
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
|
||||||
|
|
||||||
bool read_toOrder = false;
|
bool reorder = false;
|
||||||
int *h_toOrder = nullptr;
|
int *h_toOrder = nullptr;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -150,10 +154,16 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if HAVE_OPENCL
|
#if HAVE_OPENCL
|
||||||
void setKernel(kernel_type *kernel_);
|
void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_);
|
||||||
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
|
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
|
||||||
|
|
||||||
|
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
|
||||||
|
/// Those indices need to be mapped via toOrder
|
||||||
|
/// \param[in] toOrder array with mappings
|
||||||
|
/// \param[in] reorder whether reordering is actually used or not
|
||||||
|
void setReordering(int *toOrder, bool reorder);
|
||||||
void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
|
void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
|
||||||
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
|
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
|
||||||
void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
|
void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -88,6 +88,9 @@ std::string getErrorString(cl_int error)
|
|||||||
case -67: return "CL_INVALID_LINKER_OPTIONS";
|
case -67: return "CL_INVALID_LINKER_OPTIONS";
|
||||||
case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
|
case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
|
||||||
|
|
||||||
|
// vendor specific errors
|
||||||
|
case -9999: return "NVIDIA_OPENCL_ILLEGAL_READ_OR_WRITE_TO_BUFFER";
|
||||||
|
|
||||||
default: return "UNKNOWN_CL_CODE";
|
default: return "UNKNOWN_CL_CODE";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -440,6 +440,79 @@ namespace bda
|
|||||||
}
|
}
|
||||||
)";
|
)";
|
||||||
|
|
||||||
|
inline const char* stdwell_apply_no_reorder_s = R"(
|
||||||
|
__kernel void stdwell_apply_no_reorder(__global const double *Cnnzs,
|
||||||
|
__global const double *Dnnzs,
|
||||||
|
__global const double *Bnnzs,
|
||||||
|
__global const int *Ccols,
|
||||||
|
__global const int *Bcols,
|
||||||
|
__global const double *x,
|
||||||
|
__global double *y,
|
||||||
|
const unsigned int dim,
|
||||||
|
const unsigned int dim_wells,
|
||||||
|
__global const unsigned int *val_pointers,
|
||||||
|
__local double *localSum,
|
||||||
|
__local double *z1,
|
||||||
|
__local double *z2){
|
||||||
|
int wgId = get_group_id(0);
|
||||||
|
int wiId = get_local_id(0);
|
||||||
|
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
|
||||||
|
int valsPerBlock = dim*dim_wells;
|
||||||
|
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
|
||||||
|
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
|
||||||
|
int c = wiId % dim;
|
||||||
|
int r = (wiId/dim) % dim_wells;
|
||||||
|
double temp;
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
localSum[wiId] = 0;
|
||||||
|
if(wiId < numActiveWorkItems){
|
||||||
|
int b = wiId/valsPerBlock + val_pointers[wgId];
|
||||||
|
while(b < valSize + val_pointers[wgId]){
|
||||||
|
int colIdx = Bcols[b];
|
||||||
|
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
|
||||||
|
b += numBlocksPerWarp;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(wiId < valsPerBlock){
|
||||||
|
localSum[wiId] += localSum[wiId + valsPerBlock];
|
||||||
|
}
|
||||||
|
|
||||||
|
b = wiId/valsPerBlock + val_pointers[wgId];
|
||||||
|
|
||||||
|
if(c == 0 && wiId < valsPerBlock){
|
||||||
|
for(unsigned int stride = 2; stride > 0; stride >>= 1){
|
||||||
|
localSum[wiId] += localSum[wiId + stride];
|
||||||
|
}
|
||||||
|
z1[r] = localSum[wiId];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if(wiId < dim_wells){
|
||||||
|
temp = 0.0;
|
||||||
|
for(unsigned int i = 0; i < dim_wells; ++i){
|
||||||
|
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
|
||||||
|
}
|
||||||
|
z2[wiId] = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if(wiId < dim*valSize){
|
||||||
|
temp = 0.0;
|
||||||
|
int bb = wiId/dim + val_pointers[wgId];
|
||||||
|
for (unsigned int j = 0; j < dim_wells; ++j){
|
||||||
|
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
|
||||||
|
}
|
||||||
|
int colIdx = Ccols[bb];
|
||||||
|
y[colIdx*dim + c] -= temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)";
|
||||||
|
|
||||||
} // end namespace bda
|
} // end namespace bda
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -20,7 +20,6 @@
|
|||||||
#include <config.h>
|
#include <config.h>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||||
#include <opm/common/ErrorMacros.hpp>
|
#include <opm/common/ErrorMacros.hpp>
|
||||||
@@ -44,7 +43,7 @@ using Opm::OpmLog;
|
|||||||
using Dune::Timer;
|
using Dune::Timer;
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
|
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
|
||||||
prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
|
prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
|
||||||
|
|
||||||
cl_int err = CL_SUCCESS;
|
cl_int err = CL_SUCCESS;
|
||||||
@@ -318,7 +317,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
|||||||
double norm, norm_0;
|
double norm, norm_0;
|
||||||
|
|
||||||
if(wellContribs.getNumWells() > 0){
|
if(wellContribs.getNumWells() > 0){
|
||||||
wellContribs.setKernel(stdwell_apply_k.get());
|
wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
|
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
|
||||||
@@ -479,13 +478,13 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
|||||||
source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
|
source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
|
||||||
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
|
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
|
||||||
source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
|
source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
|
||||||
|
source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
|
||||||
program = cl::Program(*context, source);
|
program = cl::Program(*context, source);
|
||||||
program.build(devices);
|
program.build(devices);
|
||||||
|
|
||||||
prec->setOpenCLContext(context.get());
|
prec->setOpenCLContext(context.get());
|
||||||
prec->setOpenCLQueue(queue.get());
|
prec->setOpenCLQueue(queue.get());
|
||||||
|
|
||||||
rb = new double[N];
|
|
||||||
tmp = new double[N];
|
tmp = new double[N];
|
||||||
#if COPY_ROW_BY_ROW
|
#if COPY_ROW_BY_ROW
|
||||||
vals_contiguous = new double[N];
|
vals_contiguous = new double[N];
|
||||||
@@ -508,7 +507,11 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
|||||||
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
|
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
|
||||||
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
|
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
|
||||||
|
|
||||||
|
bool reorder = (opencl_ilu_reorder != ILUReorder::NONE);
|
||||||
|
if (reorder) {
|
||||||
|
rb = new double[N];
|
||||||
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
|
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
|
||||||
|
}
|
||||||
|
|
||||||
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
|
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
|
||||||
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
|
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
|
||||||
@@ -525,6 +528,10 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
|||||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
const unsigned int, const unsigned int, cl::Buffer&,
|
const unsigned int, const unsigned int, cl::Buffer&,
|
||||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
|
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
|
||||||
|
stdwell_apply_no_reorder_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
const unsigned int, const unsigned int, cl::Buffer&,
|
||||||
|
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
|
||||||
|
|
||||||
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
|
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
|
||||||
|
|
||||||
@@ -544,7 +551,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
|||||||
|
|
||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
void openclSolverBackend<block_size>::finalize() {
|
void openclSolverBackend<block_size>::finalize() {
|
||||||
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||||
delete[] rb;
|
delete[] rb;
|
||||||
|
}
|
||||||
delete[] tmp;
|
delete[] tmp;
|
||||||
#if COPY_ROW_BY_ROW
|
#if COPY_ROW_BY_ROW
|
||||||
delete[] vals_contiguous;
|
delete[] vals_contiguous;
|
||||||
@@ -572,7 +581,9 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
|
|||||||
queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
|
queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
|
||||||
queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
|
queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
|
||||||
queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
|
queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
|
||||||
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||||
queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
|
queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
|
||||||
|
}
|
||||||
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
|
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
|
||||||
event.wait();
|
event.wait();
|
||||||
|
|
||||||
@@ -624,9 +635,13 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
|
|||||||
int lmem_per_work_group = work_group_size * sizeof(double);
|
int lmem_per_work_group = work_group_size * sizeof(double);
|
||||||
prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);
|
prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);
|
||||||
|
|
||||||
|
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||||
|
rmat = mat.get();
|
||||||
|
} else {
|
||||||
toOrder = prec->getToOrder();
|
toOrder = prec->getToOrder();
|
||||||
fromOrder = prec->getFromOrder();
|
fromOrder = prec->getFromOrder();
|
||||||
rmat = prec->getRMat();
|
rmat = prec->getRMat();
|
||||||
|
}
|
||||||
|
|
||||||
if (verbosity > 2) {
|
if (verbosity > 2) {
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
@@ -641,11 +656,17 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
|
|||||||
|
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
|
void openclSolverBackend<block_size>::update_system(double *vals, double *b, WellContributions &wellContribs) {
|
||||||
Timer t;
|
Timer t;
|
||||||
|
|
||||||
mat->nnzValues = vals;
|
mat->nnzValues = vals;
|
||||||
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||||
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
|
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
|
||||||
|
wellContribs.setReordering(toOrder, true);
|
||||||
|
} else {
|
||||||
|
rb = b;
|
||||||
|
wellContribs.setReordering(nullptr, false);
|
||||||
|
}
|
||||||
|
|
||||||
if (verbosity > 2) {
|
if (verbosity > 2) {
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
@@ -703,8 +724,12 @@ template <unsigned int block_size>
|
|||||||
void openclSolverBackend<block_size>::get_result(double *x) {
|
void openclSolverBackend<block_size>::get_result(double *x) {
|
||||||
Timer t;
|
Timer t;
|
||||||
|
|
||||||
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||||
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
|
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
|
||||||
reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
|
reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
|
||||||
|
} else {
|
||||||
|
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
|
||||||
|
}
|
||||||
|
|
||||||
if (verbosity > 2) {
|
if (verbosity > 2) {
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
@@ -723,13 +748,13 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
|
|||||||
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
update_system(vals, b);
|
update_system(vals, b, wellContribs);
|
||||||
if (!create_preconditioner()) {
|
if (!create_preconditioner()) {
|
||||||
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||||
}
|
}
|
||||||
copy_system_to_gpu();
|
copy_system_to_gpu();
|
||||||
} else {
|
} else {
|
||||||
update_system(vals, b);
|
update_system(vals, b, wellContribs);
|
||||||
if (!create_preconditioner()) {
|
if (!create_preconditioner()) {
|
||||||
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class openclSolverBackend : public BdaSolver<block_size>
|
|||||||
using Base::initialized;
|
using Base::initialized;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
double *rb = nullptr; // reordered b vector, the matrix is reordered, so b must also be
|
double *rb = nullptr; // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b
|
||||||
double *vals_contiguous = nullptr; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
|
double *vals_contiguous = nullptr; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
|
||||||
|
|
||||||
// OpenCL variables must be reusable, they are initialized in initialize()
|
// OpenCL variables must be reusable, they are initialized in initialize()
|
||||||
@@ -59,7 +59,7 @@ private:
|
|||||||
cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p; // vectors, used during linear solve
|
cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p; // vectors, used during linear solve
|
||||||
cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
|
cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
|
||||||
cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
|
cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
|
||||||
cl::Buffer d_toOrder;
|
cl::Buffer d_toOrder; // only used when reordering is used
|
||||||
double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()
|
double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()
|
||||||
|
|
||||||
// shared pointers are also passed to other objects
|
// shared pointers are also passed to other objects
|
||||||
@@ -76,12 +76,17 @@ private:
|
|||||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
const unsigned int, const unsigned int, cl::Buffer&,
|
const unsigned int, const unsigned int, cl::Buffer&,
|
||||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
|
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
|
||||||
|
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||||
|
const unsigned int, const unsigned int, cl::Buffer&,
|
||||||
|
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_no_reorder_k;
|
||||||
|
|
||||||
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
|
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
|
||||||
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
|
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
|
||||||
bool analysis_done = false;
|
bool analysis_done = false;
|
||||||
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
|
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
|
||||||
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix, used for spmv
|
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
|
||||||
|
ILUReorder opencl_ilu_reorder; // reordering strategy
|
||||||
|
|
||||||
/// Divide A by B, and round up: return (int)ceil(A/B)
|
/// Divide A by B, and round up: return (int)ceil(A/B)
|
||||||
/// \param[in] A dividend
|
/// \param[in] A dividend
|
||||||
@@ -151,7 +156,8 @@ private:
|
|||||||
/// Reorder the linear system so it corresponds with the coloring
|
/// Reorder the linear system so it corresponds with the coloring
|
||||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||||
/// \param[in] b input vectors, contains N values
|
/// \param[in] b input vectors, contains N values
|
||||||
void update_system(double *vals, double *b);
|
/// \param[out] wellContribs WellContributions, to set reordering
|
||||||
|
void update_system(double *vals, double *b, WellContributions &wellContribs);
|
||||||
|
|
||||||
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
||||||
void update_system_on_gpu();
|
void update_system_on_gpu();
|
||||||
|
|||||||
Reference in New Issue
Block a user