2020-06-22 11:26:49 -05:00
|
|
|
/*
|
|
|
|
Copyright 2019 Equinor ASA
|
|
|
|
|
|
|
|
This file is part of the Open Porous Media project (OPM).
|
|
|
|
|
|
|
|
OPM is free software: you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
OPM is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
2020-06-25 11:44:49 -05:00
|
|
|
#include <config.h>
|
2020-06-22 11:26:49 -05:00
|
|
|
#include <opm/common/OpmLog/OpmLog.hpp>
|
|
|
|
#include <opm/common/ErrorMacros.hpp>
|
2020-07-02 04:08:18 -05:00
|
|
|
#include <opm/simulators/linalg/MatrixBlock.hpp>
|
2020-07-01 07:50:34 -05:00
|
|
|
#include <dune/common/timer.hh>
|
2020-06-22 11:26:49 -05:00
|
|
|
|
|
|
|
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
|
|
|
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
2020-12-17 07:49:59 -06:00
|
|
|
#include <opm/simulators/linalg/bda/fgpilu.hpp>
|
2020-06-22 11:26:49 -05:00
|
|
|
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
|
|
|
|
|
|
|
|
|
|
|
namespace bda
|
|
|
|
{
|
|
|
|
|
2020-12-17 07:49:59 -06:00
|
|
|
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
|
|
|
|
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
|
|
|
|
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
|
|
|
|
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
|
|
|
|
// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
|
2020-12-17 08:50:53 -06:00
|
|
|
#define CHOW_PATEL 0
|
2020-12-17 07:49:59 -06:00
|
|
|
#define CHOW_PATEL_GPU 1
|
|
|
|
|
2020-06-22 11:26:49 -05:00
|
|
|
using Opm::OpmLog;
|
2020-07-01 07:50:34 -05:00
|
|
|
using Dune::Timer;
|
2020-06-24 12:48:50 -05:00
|
|
|
|
2020-06-24 13:09:14 -05:00
|
|
|
template <unsigned int block_size>
|
2020-10-16 08:05:02 -05:00
|
|
|
BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
|
|
|
|
verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
|
|
|
|
{}
|
2020-06-22 11:26:49 -05:00
|
|
|
|
2020-06-24 13:09:14 -05:00
|
|
|
template <unsigned int block_size>
|
|
|
|
BILU0<block_size>::~BILU0()
|
2020-06-22 11:26:49 -05:00
|
|
|
{
|
|
|
|
delete[] invDiagVals;
|
|
|
|
delete[] diagIndex;
|
2021-01-11 09:51:05 -06:00
|
|
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
|
|
|
delete[] toOrder;
|
|
|
|
delete[] fromOrder;
|
|
|
|
}
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
|
|
|
|
2020-06-24 13:09:14 -05:00
|
|
|
template <unsigned int block_size>
|
2020-07-03 07:10:31 -05:00
|
|
|
bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
2020-06-22 11:26:49 -05:00
|
|
|
{
|
2020-06-25 04:51:41 -05:00
|
|
|
const unsigned int bs = block_size;
|
2020-06-22 11:26:49 -05:00
|
|
|
|
|
|
|
this->N = mat->Nb * block_size;
|
|
|
|
this->Nb = mat->Nb;
|
|
|
|
this->nnz = mat->nnzbs * block_size * block_size;
|
|
|
|
this->nnzbs = mat->nnzbs;
|
|
|
|
|
2021-01-11 09:51:05 -06:00
|
|
|
int *CSCRowIndices = nullptr;
|
|
|
|
int *CSCColPointers = nullptr;
|
2020-07-03 07:10:31 -05:00
|
|
|
|
2021-01-11 09:51:05 -06:00
|
|
|
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
|
|
|
LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
|
|
|
|
} else {
|
|
|
|
toOrder = new int[Nb];
|
|
|
|
fromOrder = new int[Nb];
|
|
|
|
CSCRowIndices = new int[nnzbs];
|
|
|
|
CSCColPointers = new int[Nb + 1];
|
|
|
|
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
|
|
|
|
LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
|
|
|
|
|
|
|
|
Timer t_convert;
|
|
|
|
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
|
|
|
|
if(verbosity >= 3){
|
|
|
|
std::ostringstream out;
|
|
|
|
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
|
|
|
|
OpmLog::info(out.str());
|
|
|
|
}
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
|
|
|
|
2020-07-01 07:50:34 -05:00
|
|
|
Timer t_analysis;
|
2020-10-15 03:52:10 -05:00
|
|
|
std::ostringstream out;
|
2020-10-16 08:05:02 -05:00
|
|
|
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
2020-10-15 03:52:10 -05:00
|
|
|
out << "BILU0 reordering strategy: " << "level_scheduling\n";
|
2020-07-07 04:00:38 -05:00
|
|
|
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
|
2020-10-16 08:05:02 -05:00
|
|
|
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
2020-10-15 03:52:10 -05:00
|
|
|
out << "BILU0 reordering strategy: " << "graph_coloring\n";
|
2020-07-07 04:00:38 -05:00
|
|
|
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
|
2020-12-17 07:49:59 -06:00
|
|
|
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
|
|
|
|
out << "BILU0 reordering strategy: none\n";
|
2021-01-11 09:51:05 -06:00
|
|
|
// numColors = 1;
|
|
|
|
// rowsPerColor.emplace_back(Nb);
|
|
|
|
numColors = Nb;
|
2020-12-17 07:49:59 -06:00
|
|
|
for(int i = 0; i < Nb; ++i){
|
2021-01-11 09:51:05 -06:00
|
|
|
rowsPerColor.emplace_back(1);
|
2020-12-17 07:49:59 -06:00
|
|
|
}
|
2020-10-15 03:52:10 -05:00
|
|
|
} else {
|
|
|
|
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
|
|
|
if(verbosity >= 3){
|
2020-12-17 07:49:59 -06:00
|
|
|
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
|
|
|
|
}
|
|
|
|
if(verbosity >= 2){
|
|
|
|
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
2020-10-15 03:52:10 -05:00
|
|
|
OpmLog::info(out.str());
|
2020-06-22 11:26:49 -05:00
|
|
|
|
2021-01-11 09:51:05 -06:00
|
|
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
|
|
|
delete[] CSCRowIndices;
|
|
|
|
delete[] CSCColPointers;
|
|
|
|
}
|
2020-07-03 07:10:31 -05:00
|
|
|
|
2020-06-22 11:26:49 -05:00
|
|
|
diagIndex = new int[mat->Nb];
|
2020-06-25 04:51:41 -05:00
|
|
|
invDiagVals = new double[mat->Nb * bs * bs];
|
2020-06-22 11:26:49 -05:00
|
|
|
|
2020-07-03 07:10:31 -05:00
|
|
|
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
|
|
|
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
2020-06-22 11:26:49 -05:00
|
|
|
|
2020-07-01 08:16:14 -05:00
|
|
|
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
|
|
|
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs);
|
|
|
|
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
|
|
|
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Umat->nnzbs);
|
|
|
|
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
|
|
|
|
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Umat->Nb + 1));
|
2020-06-25 04:51:41 -05:00
|
|
|
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
|
2020-06-22 11:26:49 -05:00
|
|
|
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
|
|
|
|
|
2020-07-01 08:16:14 -05:00
|
|
|
queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues);
|
|
|
|
queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues);
|
|
|
|
queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
|
|
|
|
queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices);
|
|
|
|
queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
|
|
|
|
queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers);
|
2020-06-25 04:51:41 -05:00
|
|
|
queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals);
|
2020-06-22 11:26:49 -05:00
|
|
|
|
|
|
|
int *rowsPerColorPrefix = new int[numColors + 1];
|
2020-08-05 14:17:57 -05:00
|
|
|
rowsPerColorPrefix[0] = 0;
|
|
|
|
for (int i = 0; i < numColors; ++i) {
|
2020-08-07 02:38:10 -05:00
|
|
|
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
|
|
|
queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix);
|
|
|
|
delete[] rowsPerColorPrefix;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
} // end init()
|
|
|
|
|
2020-12-17 07:49:59 -06:00
|
|
|
// implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015
|
|
|
|
template <unsigned int block_size>
|
|
|
|
void BILU0<block_size>::chow_patel_decomposition()
|
|
|
|
{
|
|
|
|
const unsigned int bs = block_size;
|
|
|
|
int num_sweeps = 6;
|
|
|
|
|
|
|
|
// split matrix into L and U
|
|
|
|
// also convert U into BSC format (Ut)
|
|
|
|
// Ut stores diagonal for now
|
|
|
|
int num_blocks_L = 0;
|
|
|
|
|
|
|
|
// Ut is actually BSC format
|
|
|
|
std::unique_ptr<BlockedMatrix<bs> > Ut = std::make_unique<BlockedMatrix<bs> >(Umat->Nb, Umat->nnzbs + Umat->Nb);
|
|
|
|
|
|
|
|
Lmat->rowPointers[0] = 0;
|
|
|
|
for (int i = 0; i < Nb+1; i++) {
|
|
|
|
Ut->rowPointers[i] = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// for every row
|
|
|
|
for (int i = 0; i < Nb; i++) {
|
|
|
|
int iRowStart = LUmat->rowPointers[i];
|
|
|
|
int iRowEnd = LUmat->rowPointers[i + 1];
|
|
|
|
// for every block in this row
|
|
|
|
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
|
|
|
int j = LUmat->colIndices[ij];
|
|
|
|
if (i <= j) {
|
|
|
|
Ut->rowPointers[j+1]++; // actually colPointers
|
|
|
|
} else {
|
|
|
|
Lmat->colIndices[num_blocks_L] = j;
|
|
|
|
memcpy(Lmat->nnzValues + num_blocks_L * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
|
|
|
|
num_blocks_L++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Lmat->rowPointers[i+1] = num_blocks_L;
|
|
|
|
}
|
|
|
|
|
|
|
|
// prefix sum
|
|
|
|
int sum = 0;
|
|
|
|
for (int i = 1; i < Nb+1; i++) {
|
|
|
|
sum += Ut->rowPointers[i];
|
|
|
|
Ut->rowPointers[i] = sum;
|
|
|
|
}
|
|
|
|
|
|
|
|
// for every row
|
|
|
|
for (int i = 0; i < Nb; i++) {
|
|
|
|
int iRowStart = LUmat->rowPointers[i];
|
|
|
|
int iRowEnd = LUmat->rowPointers[i + 1];
|
|
|
|
// for every block in this row
|
|
|
|
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
|
|
|
int j = LUmat->colIndices[ij];
|
|
|
|
if (i <= j){
|
|
|
|
int idx = Ut->rowPointers[j]++;
|
|
|
|
Ut->colIndices[idx] = i; // actually rowIndices
|
|
|
|
memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// rotate
|
|
|
|
// the Ut->rowPointers were increased in the last loop
|
|
|
|
for (int i = Nb; i > 0; --i) {
|
|
|
|
Ut->rowPointers[i] = Ut->rowPointers[i-1];
|
|
|
|
}
|
|
|
|
Ut->rowPointers[0] = 0;
|
|
|
|
|
|
|
|
Opm::Detail::Inverter<bs> inverter;
|
|
|
|
|
|
|
|
// Utmp is needed for CPU and GPU decomposition, because U is transposed, and reversed at the end
|
|
|
|
// Ltmp is only needed for CPU decomposition, GPU creates GPU buffer for Ltmp
|
|
|
|
double *Utmp = new double[Ut->nnzbs * block_size * block_size];
|
|
|
|
|
|
|
|
// actual ILU decomposition
|
|
|
|
#if CHOW_PATEL_GPU
|
|
|
|
fgpilu.decomposition(queue, context,
|
|
|
|
Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs,
|
|
|
|
Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs,
|
|
|
|
LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs,
|
|
|
|
Nb, num_sweeps, verbosity);
|
|
|
|
#else
|
|
|
|
double *Ltmp = new double[Lmat->nnzbs * block_size * block_size];
|
|
|
|
for (int sweep = 0; sweep < num_sweeps; ++sweep) {
|
|
|
|
|
|
|
|
// for every row
|
|
|
|
for (int row = 0; row < Nb; row++) {
|
|
|
|
// update U
|
|
|
|
int jColStart = Ut->rowPointers[row];
|
|
|
|
int jColEnd = Ut->rowPointers[row + 1];
|
|
|
|
// for every block in this row
|
|
|
|
for (int ij = jColStart; ij < jColEnd; ij++) {
|
|
|
|
int col = Ut->colIndices[ij];
|
|
|
|
// refine Uij element (or diagonal)
|
|
|
|
int i1 = LUmat->rowPointers[col];
|
|
|
|
int i2 = LUmat->rowPointers[col+1];
|
|
|
|
int kk = 0;
|
|
|
|
for(kk = i1; kk < i2; ++kk) {
|
|
|
|
ptrdiff_t c = LUmat->colIndices[kk];
|
|
|
|
if (c >= row) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
double aij[bs*bs];
|
|
|
|
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
|
|
|
|
int jk = Lmat->rowPointers[col];
|
|
|
|
int ik = (jk < Lmat->rowPointers[col+1]) ? Lmat->colIndices[jk] : Nb;
|
|
|
|
|
|
|
|
for (int k = jColStart; k < ij; ++k) {
|
|
|
|
int ki = Ut->colIndices[k];
|
|
|
|
while (ik < ki) {
|
|
|
|
++jk;
|
|
|
|
ik = Lmat->colIndices[jk];
|
|
|
|
}
|
|
|
|
if (ik == ki) {
|
|
|
|
blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs);
|
|
|
|
}
|
|
|
|
|
|
|
|
// update L
|
|
|
|
int iRowStart = Lmat->rowPointers[row];
|
|
|
|
int iRowEnd = Lmat->rowPointers[row + 1];
|
|
|
|
|
|
|
|
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
|
|
|
int j = Lmat->colIndices[ij];
|
|
|
|
// refine Lij element
|
|
|
|
int i1 = LUmat->rowPointers[row];
|
|
|
|
int i2 = LUmat->rowPointers[row+1];
|
|
|
|
int kk = 0;
|
|
|
|
for(kk = i1; kk < i2; ++kk) {
|
|
|
|
ptrdiff_t c = LUmat->colIndices[kk];
|
|
|
|
if (c >= j) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
double aij[bs*bs];
|
|
|
|
memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs);
|
|
|
|
int jk = Ut->rowPointers[j];
|
|
|
|
int ik = Ut->colIndices[jk];
|
|
|
|
for (int k = iRowStart; k < ij; ++k) {
|
|
|
|
int ki = Lmat->colIndices[k];
|
|
|
|
while(ik < ki) {
|
|
|
|
++jk;
|
|
|
|
ik = Ut->colIndices[jk];
|
|
|
|
}
|
|
|
|
|
|
|
|
if(ik == ki) {
|
|
|
|
blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// calculate aij / ujj
|
|
|
|
double ujj[bs*bs];
|
|
|
|
inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
|
|
|
|
// lij = aij / ujj
|
|
|
|
blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
double *t = Lmat->nnzValues;
|
|
|
|
Lmat->nnzValues = Ltmp;
|
|
|
|
Ltmp = t;
|
|
|
|
t = Ut->nnzValues;
|
|
|
|
Ut->nnzValues = Utmp;
|
|
|
|
Utmp = t;
|
|
|
|
} // end sweep
|
|
|
|
delete[] Ltmp;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// convert Ut to BSR
|
|
|
|
// diagonal stored separately
|
|
|
|
std::vector<int> ptr(Nb+1, 0);
|
|
|
|
std::vector<int> col(Ut->rowPointers[Nb]);
|
|
|
|
|
|
|
|
// count blocks per row for U (BSR)
|
|
|
|
// store diagonal in invDiagVals
|
|
|
|
for(int i = 0; i < Nb; ++i) {
|
|
|
|
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
|
|
|
|
int j = Ut->colIndices[k];
|
|
|
|
if (j == i) {
|
|
|
|
inverter(Ut->nnzValues + k * bs * bs, invDiagVals + i * bs * bs);
|
|
|
|
} else {
|
|
|
|
++ptr[j+1];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// prefix sum
|
|
|
|
int sumU = 0;
|
|
|
|
for (int i = 1; i < Nb+1; i++) {
|
|
|
|
sumU += ptr[i];
|
|
|
|
ptr[i] = sumU;
|
|
|
|
}
|
|
|
|
|
|
|
|
// actually copy nonzero values for U
|
|
|
|
for(int i = 0; i < Nb; ++i) {
|
|
|
|
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
|
|
|
|
int j = Ut->colIndices[k];
|
|
|
|
if (j != i) {
|
|
|
|
int head = ptr[j]++;
|
|
|
|
col[head] = i;
|
|
|
|
memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// the ptr[] were increased in the last loop
|
|
|
|
std::rotate(ptr.begin(), ptr.end() - 1, ptr.end());
|
|
|
|
ptr.front() = 0;
|
|
|
|
|
|
|
|
// reversing the rows of U, because that is the order they are used in
|
|
|
|
int URowIndex = 0;
|
|
|
|
int offsetU = 0; // number of nnz blocks that are already copied to Umat
|
|
|
|
Umat->rowPointers[0] = 0;
|
|
|
|
for (int i = LUmat->Nb - 1; i >= 0; i--) {
|
|
|
|
int rowSize = ptr[i + 1] - ptr[i]; // number of blocks in this row
|
|
|
|
memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize);
|
|
|
|
memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize);
|
|
|
|
offsetU += rowSize;
|
|
|
|
Umat->rowPointers[URowIndex + 1] = offsetU;
|
|
|
|
URowIndex++;
|
|
|
|
}
|
|
|
|
|
|
|
|
delete[] Utmp;
|
|
|
|
|
|
|
|
}
|
2020-06-22 11:26:49 -05:00
|
|
|
|
2020-06-24 13:09:14 -05:00
|
|
|
template <unsigned int block_size>
|
2020-07-03 07:10:31 -05:00
|
|
|
bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
|
2020-06-22 11:26:49 -05:00
|
|
|
{
|
2020-06-25 04:51:41 -05:00
|
|
|
const unsigned int bs = block_size;
|
2021-01-11 09:51:05 -06:00
|
|
|
auto *m = mat;
|
2020-07-03 07:10:31 -05:00
|
|
|
|
2021-01-11 09:51:05 -06:00
|
|
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
|
|
|
m = rmat.get();
|
|
|
|
Timer t_reorder;
|
|
|
|
reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
|
|
|
|
|
|
|
|
if (verbosity >= 3){
|
|
|
|
std::ostringstream out;
|
|
|
|
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
|
|
|
|
OpmLog::info(out.str());
|
|
|
|
}
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
2021-01-11 09:51:05 -06:00
|
|
|
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
|
2020-07-01 07:50:34 -05:00
|
|
|
Timer t_copy;
|
2021-01-11 09:51:05 -06:00
|
|
|
memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
|
2020-07-03 07:10:31 -05:00
|
|
|
|
2020-06-22 11:26:49 -05:00
|
|
|
if (verbosity >= 3){
|
|
|
|
std::ostringstream out;
|
2020-07-01 07:50:34 -05:00
|
|
|
out << "BILU0 memcpy: " << t_copy.stop() << " s";
|
2020-06-22 11:26:49 -05:00
|
|
|
OpmLog::info(out.str());
|
|
|
|
}
|
|
|
|
|
2020-12-17 07:49:59 -06:00
|
|
|
Timer t_decomposition;
|
|
|
|
#if CHOW_PATEL
|
|
|
|
chow_patel_decomposition();
|
|
|
|
#else
|
2020-06-22 11:26:49 -05:00
|
|
|
int i, j, ij, ik, jk;
|
|
|
|
int iRowStart, iRowEnd, jRowEnd;
|
2020-06-25 04:51:41 -05:00
|
|
|
double pivot[bs * bs];
|
2020-06-22 11:26:49 -05:00
|
|
|
|
|
|
|
int LSize = 0;
|
2020-07-02 04:08:18 -05:00
|
|
|
Opm::Detail::Inverter<bs> inverter; // reuse inverter to invert blocks
|
2020-06-22 11:26:49 -05:00
|
|
|
|
|
|
|
// go through all rows
|
2020-07-01 08:16:14 -05:00
|
|
|
for (i = 0; i < LUmat->Nb; i++) {
|
|
|
|
iRowStart = LUmat->rowPointers[i];
|
|
|
|
iRowEnd = LUmat->rowPointers[i + 1];
|
2020-06-22 11:26:49 -05:00
|
|
|
|
|
|
|
// go through all elements of the row
|
|
|
|
for (ij = iRowStart; ij < iRowEnd; ij++) {
|
2020-07-01 08:16:14 -05:00
|
|
|
j = LUmat->colIndices[ij];
|
2020-06-22 11:26:49 -05:00
|
|
|
// if the element is the diagonal, store the index and go to next row
|
|
|
|
if (j == i) {
|
|
|
|
diagIndex[i] = ij;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
// if an element beyond the diagonal is reach, no diagonal was found
|
|
|
|
// throw an error now. TODO: perform reordering earlier to prevent this
|
|
|
|
if (j > i) {
|
|
|
|
std::ostringstream out;
|
|
|
|
out << "BILU0 Error could not find diagonal value in row: " << i;
|
|
|
|
OpmLog::error(out.str());
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
LSize++;
|
|
|
|
// calculate the pivot of this row
|
2020-07-01 08:16:14 -05:00
|
|
|
blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
|
2020-06-22 11:26:49 -05:00
|
|
|
|
2020-07-01 08:16:14 -05:00
|
|
|
memcpy(LUmat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * block_size * block_size);
|
2020-06-22 11:26:49 -05:00
|
|
|
|
2020-07-01 08:16:14 -05:00
|
|
|
jRowEnd = LUmat->rowPointers[j + 1];
|
2020-06-22 11:26:49 -05:00
|
|
|
jk = diagIndex[j] + 1;
|
|
|
|
ik = ij + 1;
|
|
|
|
// substract that row scaled by the pivot from this row.
|
|
|
|
while (ik < iRowEnd && jk < jRowEnd) {
|
2020-07-01 08:16:14 -05:00
|
|
|
if (LUmat->colIndices[ik] == LUmat->colIndices[jk]) {
|
|
|
|
blockMultSub<bs>(LUmat->nnzValues + ik * bs * bs, pivot, LUmat->nnzValues + jk * bs * bs);
|
2020-06-22 11:26:49 -05:00
|
|
|
ik++;
|
|
|
|
jk++;
|
|
|
|
} else {
|
2020-07-01 08:16:14 -05:00
|
|
|
if (LUmat->colIndices[ik] < LUmat->colIndices[jk])
|
2020-06-22 11:26:49 -05:00
|
|
|
{ ik++; }
|
|
|
|
else
|
|
|
|
{ jk++; }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// store the inverse in the diagonal!
|
2020-07-02 04:08:18 -05:00
|
|
|
inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
|
2020-06-22 11:26:49 -05:00
|
|
|
|
2020-07-01 08:16:14 -05:00
|
|
|
memcpy(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
|
|
|
|
2020-07-01 08:16:14 -05:00
|
|
|
Lmat->rowPointers[0] = 0;
|
|
|
|
Umat->rowPointers[0] = 0;
|
2020-06-22 11:26:49 -05:00
|
|
|
|
|
|
|
// Split the LU matrix into two by comparing column indices to diagonal indices
|
2020-07-01 08:16:14 -05:00
|
|
|
for (i = 0; i < LUmat->Nb; i++) {
|
|
|
|
int offsetL = Lmat->rowPointers[i];
|
|
|
|
int rowSize = diagIndex[i] - LUmat->rowPointers[i];
|
|
|
|
int offsetLU = LUmat->rowPointers[i];
|
|
|
|
memcpy(Lmat->nnzValues + offsetL * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize);
|
|
|
|
memcpy(Lmat->colIndices + offsetL, LUmat->colIndices + offsetLU, sizeof(int) * rowSize);
|
2020-06-22 11:26:49 -05:00
|
|
|
offsetL += rowSize;
|
2020-07-01 08:16:14 -05:00
|
|
|
Lmat->rowPointers[i + 1] = offsetL;
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
|
|
|
// Reverse the order or the (blocked) rows for the U matrix,
|
|
|
|
// because the rows are accessed in reverse order when applying the ILU0
|
|
|
|
int URowIndex = 0;
|
2020-07-01 08:16:14 -05:00
|
|
|
for (i = LUmat->Nb - 1; i >= 0; i--) {
|
|
|
|
int offsetU = Umat->rowPointers[URowIndex];
|
|
|
|
int rowSize = LUmat->rowPointers[i + 1] - diagIndex[i] - 1;
|
2020-06-22 11:26:49 -05:00
|
|
|
int offsetLU = diagIndex[i] + 1;
|
2020-07-01 08:16:14 -05:00
|
|
|
memcpy(Umat->nnzValues + offsetU * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize);
|
|
|
|
memcpy(Umat->colIndices + offsetU, LUmat->colIndices + offsetLU, sizeof(int) * rowSize);
|
2020-06-22 11:26:49 -05:00
|
|
|
offsetU += rowSize;
|
2020-07-01 08:16:14 -05:00
|
|
|
Umat->rowPointers[URowIndex + 1] = offsetU;
|
2020-06-22 11:26:49 -05:00
|
|
|
URowIndex++;
|
|
|
|
}
|
2020-12-17 07:49:59 -06:00
|
|
|
#endif
|
2020-06-22 11:26:49 -05:00
|
|
|
if (verbosity >= 3) {
|
|
|
|
std::ostringstream out;
|
2020-07-01 07:50:34 -05:00
|
|
|
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
|
2020-06-22 11:26:49 -05:00
|
|
|
OpmLog::info(out.str());
|
|
|
|
}
|
|
|
|
|
2020-07-01 07:50:34 -05:00
|
|
|
Timer t_copyToGpu;
|
2020-06-22 11:26:49 -05:00
|
|
|
if (pattern_uploaded == false) {
|
2020-07-01 08:16:14 -05:00
|
|
|
queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices);
|
|
|
|
queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices);
|
|
|
|
queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers);
|
|
|
|
queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers);
|
2020-06-22 11:26:49 -05:00
|
|
|
pattern_uploaded = true;
|
|
|
|
}
|
2020-07-01 08:16:14 -05:00
|
|
|
queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues);
|
|
|
|
queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues);
|
2020-06-25 04:51:41 -05:00
|
|
|
queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(double) * bs * bs, invDiagVals);
|
2020-06-22 11:26:49 -05:00
|
|
|
if (verbosity >= 3) {
|
|
|
|
std::ostringstream out;
|
2020-07-01 07:50:34 -05:00
|
|
|
out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
|
2020-06-22 11:26:49 -05:00
|
|
|
OpmLog::info(out.str());
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
} // end create_preconditioner()
|
|
|
|
|
|
|
|
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
|
|
|
|
// however, if individual kernel calls are timed, waiting for events is needed
|
|
|
|
// behavior on other GPUs is untested
|
2020-06-24 13:09:14 -05:00
|
|
|
template <unsigned int block_size>
|
|
|
|
void BILU0<block_size>::apply(cl::Buffer& x, cl::Buffer& y)
|
2020-06-22 11:26:49 -05:00
|
|
|
{
|
|
|
|
cl::Event event;
|
2020-07-01 07:50:34 -05:00
|
|
|
Timer t_apply;
|
2020-06-22 11:26:49 -05:00
|
|
|
|
2020-06-25 02:40:20 -05:00
|
|
|
for(int color = 0; color < numColors; ++color){
|
2020-06-22 11:26:49 -05:00
|
|
|
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
|
|
|
// event.wait();
|
|
|
|
}
|
2020-12-17 07:49:59 -06:00
|
|
|
|
2020-06-22 11:26:49 -05:00
|
|
|
for(int color = numColors-1; color >= 0; --color){
|
|
|
|
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
|
|
|
// event.wait();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (verbosity >= 3) {
|
|
|
|
event.wait();
|
|
|
|
std::ostringstream out;
|
2020-07-01 07:50:34 -05:00
|
|
|
out << "BILU0 apply: " << t_apply.stop() << " s";
|
2020-06-22 11:26:49 -05:00
|
|
|
OpmLog::info(out.str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2020-06-24 13:09:14 -05:00
|
|
|
template <unsigned int block_size>
|
2020-06-25 02:40:20 -05:00
|
|
|
void BILU0<block_size>::setOpenCLContext(cl::Context *context_){
|
|
|
|
this->context = context_;
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
2020-06-24 13:09:14 -05:00
|
|
|
template <unsigned int block_size>
|
2020-06-25 02:40:20 -05:00
|
|
|
void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_){
|
|
|
|
this->queue = queue_;
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
2020-06-24 13:09:14 -05:00
|
|
|
template <unsigned int block_size>
|
2020-06-25 02:40:20 -05:00
|
|
|
void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_){
|
|
|
|
this->work_group_size = work_group_size_;
|
|
|
|
this->total_work_items = total_work_items_;
|
|
|
|
this->lmem_per_work_group = lmem_per_work_group_;
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
2020-06-24 13:09:14 -05:00
|
|
|
template <unsigned int block_size>
|
|
|
|
void BILU0<block_size>::setKernels(
|
2020-06-23 04:30:15 -05:00
|
|
|
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
|
|
|
|
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_
|
2020-06-22 11:26:49 -05:00
|
|
|
){
|
2020-06-23 04:30:15 -05:00
|
|
|
this->ILU_apply1 = ILU_apply1_;
|
|
|
|
this->ILU_apply2 = ILU_apply2_;
|
2020-06-22 11:26:49 -05:00
|
|
|
}
|
|
|
|
|
2020-06-24 13:09:14 -05:00
|
|
|
|
|
|
|
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
2020-10-16 08:05:02 -05:00
|
|
|
template BILU0<n>::BILU0(ILUReorder, int); \
|
2020-07-01 12:43:22 -05:00
|
|
|
template BILU0<n>::~BILU0(); \
|
2020-07-03 07:10:31 -05:00
|
|
|
template bool BILU0<n>::init(BlockedMatrix<n>*); \
|
2020-12-17 07:49:59 -06:00
|
|
|
template void BILU0<n>::chow_patel_decomposition(); \
|
2020-07-03 07:10:31 -05:00
|
|
|
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
|
2020-06-24 13:09:14 -05:00
|
|
|
template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y); \
|
|
|
|
template void BILU0<n>::setOpenCLContext(cl::Context*); \
|
|
|
|
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*); \
|
|
|
|
template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int); \
|
|
|
|
template void BILU0<n>::setKernels( \
|
|
|
|
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
|
|
|
|
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> * \
|
|
|
|
);
|
|
|
|
|
|
|
|
INSTANTIATE_BDA_FUNCTIONS(1);
|
|
|
|
INSTANTIATE_BDA_FUNCTIONS(2);
|
|
|
|
INSTANTIATE_BDA_FUNCTIONS(3);
|
|
|
|
INSTANTIATE_BDA_FUNCTIONS(4);
|
|
|
|
|
|
|
|
#undef INSTANTIATE_BDA_FUNCTIONS
|
|
|
|
|
2020-06-22 11:26:49 -05:00
|
|
|
} // end namespace bda
|
|
|
|
|
|
|
|
|