opm-simulators/opm/simulators/linalg/bda/FPGABILU0.cpp

416 lines
17 KiB
C++

/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/MatrixBlock.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace bda
{
using Opm::OpmLog;
using Dune::Timer;
/// Stores the hardware limits and verbosity, and translates the requested
/// reordering strategy into the level_scheduling / graph_coloring flags.
///
/// \param opencl_ilu_reorder_  reordering strategy to use for the ILU0
/// \param verbosity_           logging verbosity (timings are logged at >= 3)
/// \param maxRowsPerColor_     limit forwarded to the coloring/partitioning routines
/// \param maxColsPerColor_     limit forwarded to the coloring/partitioning routines
/// \param maxNNZsPerRow_       limit forwarded to the RDF conversion
/// \param maxNumColors_        maximum number of colors accepted by init()
///
/// Any strategy other than LEVEL_SCHEDULING or GRAPH_COLORING is rejected
/// through OPM_THROW with std::logic_error.
template <unsigned int block_size>
FPGABILU0<block_size>::FPGABILU0(ILUReorder opencl_ilu_reorder_, int verbosity_, int maxRowsPerColor_, int maxColsPerColor_, int maxNNZsPerRow_, int maxNumColors_) :
    verbosity(verbosity_),
    opencl_ilu_reorder(opencl_ilu_reorder_),
    maxRowsPerColor(maxRowsPerColor_),
    maxColsPerColor(maxColsPerColor_),
    maxNNZsPerRow(maxNNZsPerRow_),
    maxNumColors(maxNumColors_)
{
    switch (opencl_ilu_reorder) {
    case ILUReorder::LEVEL_SCHEDULING:
        level_scheduling = true;
        break;
    case ILUReorder::GRAPH_COLORING:
        graph_coloring = true;
        break;
    default:
        OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
    }
}
/// Frees the inverse-diagonal buffer that init() allocates with new[].
template <unsigned int block_size>
FPGABILU0<block_size>::~FPGABILU0()
{
    // array form of delete to match the new double[...] in init()
    delete[] this->invDiagVals;
}
/// Analyses the sparsity pattern of \p mat and allocates every buffer that
/// create_preconditioner() later fills.
///
/// Steps performed:
///  - determine a row reordering (level scheduling or graph coloring) and the
///    resulting colors,
///  - reorder the pattern of \p mat into rMat,
///  - compute worst-case sizes of the value/index/offset arrays and resize them,
///  - determine the columns accessed per color (findPartitionColumns),
///  - allocate the diagonal-inverse buffer and the L and U matrices,
///  - publish buffer addresses and worst-case sizes in resultPointers/resultSizes.
///
/// \param mat  matrix whose sparsity pattern is analysed
/// \return     true; failures are reported through OPM_THROW(std::logic_error, ...)
///             (too many colors, or findPartitionColumns returning non-zero)
template <unsigned int block_size>
bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
{
    const unsigned int bs = block_size;

    resultPointers.resize(numResultPointers, nullptr);
    resultSizes.resize(numResultSizes);

    // Set nnzSplit as hardcoded constant until support for more than one nnzVals read array is added.
    const unsigned int nnzSplit = 1;

    this->N = mat->Nb * block_size;
    this->Nb = mat->Nb;
    this->nnz = mat->nnzbs * block_size * block_size;
    this->nnzbs = mat->nnzbs;

    toOrder.resize(Nb);
    fromOrder.resize(Nb);

    std::vector<int> CSCRowIndices(nnzbs);
    std::vector<int> CSCColPointers(Nb + 1);

    // The CSC pattern is only computed for level scheduling
    if (level_scheduling) {
        Timer t_convert;
        csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
        if (verbosity >= 3) {
            std::ostringstream out;
            out << "FPGABILU0 convert CSR to CSC: " << t_convert.stop() << " s";
            OpmLog::info(out.str());
        }
    }

    Timer t_analysis;
    rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
    LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
    std::ostringstream out;
    if (level_scheduling) {
        out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
        findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
    } else if (graph_coloring) {
        out << "FPGABILU0 reordering strategy: " << "graph_coloring\n";
        findGraphColoring<bs>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, maxRowsPerColor, maxColsPerColor, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
    }

    if (numColors > maxNumColors) {
        std::ostringstream errorstring;
        errorstring << "ERROR: the matrix was reordered into too many colors. Created " << numColors << " colors, while hardware only supports up to " << maxNumColors << "\n";
        OPM_THROW(std::logic_error, errorstring.str());
    }

    if (verbosity >= 3) {
        out << "FPGABILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
    }
    // The strategy line is logged regardless of verbosity
    OpmLog::info(out.str());

    int colorRoundedValSize = 0, LColorRoundedValSize = 0, UColorRoundedValSize = 0;
    int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
    int blockDiagSize = 0;

    // This reordering is needed here only so the result can be used to calculate worst-case scenario array sizes
    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());

    // NOTE(review): the per-block increments 9 and 12 are hard-coded; they look
    // like 3x3 blocks and a padded diagonal block - confirm for block_size != 3.
    int doneRows = 0;
    for (int c = 0; c < numColors; c++) {
        for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
            for (int j = rMat->rowPointers[i]; j < rMat->rowPointers[i + 1]; j++) {
                int columnIndex = rMat->colIndices[j];
                if (columnIndex < i) {
                    LColorRoundedValSize += 9;
                    LNROffsetSize += 9;
                }
                if (columnIndex > i) {
                    UColorRoundedValSize += 9;
                    UNROffsetSize += 9;
                }
                colorRoundedValSize += 9;
                NROffsetSize += 9;
            }
            blockDiagSize += 12;
        }
        // End of color: round all sizes to nearest cacheline
        colorRoundedValSize = roundUpTo(colorRoundedValSize, 32);
        LColorRoundedValSize = roundUpTo(LColorRoundedValSize, 32);
        UColorRoundedValSize = roundUpTo(UColorRoundedValSize, 32);
        NROffsetSize = roundUpTo(NROffsetSize, 64);
        LNROffsetSize = roundUpTo(LNROffsetSize, 64);
        UNROffsetSize = roundUpTo(UNROffsetSize, 64);
        blockDiagSize = roundUpTo(blockDiagSize, 8);

        doneRows += rowsPerColor[c];
    }

    int colorSizesNum = 8 + roundUpTo(4 * numColors, 16);
    int worstCaseColumnAccessNum = numColors * maxColsPerColor;

    nnzValues.resize(nnzSplit, std::vector<double>(colorRoundedValSize));
    LnnzValues.resize(nnzSplit, std::vector<double>(LColorRoundedValSize));
    UnnzValues.resize(nnzSplit, std::vector<double>(UColorRoundedValSize));
    // initial number of nnz, used to allocate
    nnzValsSizes.resize(nnzSplit, colorRoundedValSize);
    LnnzValsSizes.resize(nnzSplit, LColorRoundedValSize);
    UnnzValsSizes.resize(nnzSplit, UColorRoundedValSize);
    colIndices.resize(colorRoundedValSize);
    LColIndices.resize(LColorRoundedValSize);
    UColIndices.resize(UColorRoundedValSize);
    NROffsets.resize(NROffsetSize);
    LNROffsets.resize(LNROffsetSize);
    UNROffsets.resize(UNROffsetSize);
    PIndicesAddr.resize(worstCaseColumnAccessNum);
    LPIndicesAddr.resize(worstCaseColumnAccessNum);
    UPIndicesAddr.resize(worstCaseColumnAccessNum);
    colorSizes.resize(colorSizesNum);
    LColorSizes.resize(colorSizesNum);
    UColorSizes.resize(colorSizesNum);
    blockDiag.resize(blockDiagSize);
    colIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
    LColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
    UColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));

    int err = rMat->findPartitionColumns(numColors, rowsPerColor.data(),
                                         maxRowsPerColor, maxColsPerColor,
                                         colIndicesInColor, PIndicesAddr.data(), colorSizes.data(),
                                         LColIndicesInColor, LPIndicesAddr.data(), LColorSizes.data(),
                                         UColIndicesInColor, UPIndicesAddr.data(), UColorSizes.data());
    if (err != 0) {
        std::ostringstream errorstring;
        errorstring << "ERROR: findPartitionColumns failed, code " << err << "\n";
        OPM_THROW(std::logic_error, errorstring.str());
    }

    diagIndex.resize(mat->Nb, 0);

    // Release a buffer left over from an earlier init() before allocating a new
    // one. Deleting a null pointer is a no-op; this relies on invDiagVals being
    // null before the first init(), which the destructor already depends on.
    delete[] invDiagVals;
    invDiagVals = nullptr;
    invDiagVals = new double[mat->Nb * bs * bs];

    // Count the blocks strictly below and strictly above the diagonal of the
    // reordered pattern, so L and U get exactly the capacity that the split in
    // create_preconditioner() writes. For a structurally symmetric pattern with
    // a full diagonal both counts equal (nnzbs - Nb) / 2.
    int LNnzbs = 0;
    int UNnzbs = 0;
    for (int i = 0; i < rMat->Nb; i++) {
        for (int j = rMat->rowPointers[i]; j < rMat->rowPointers[i + 1]; j++) {
            const int columnIndex = rMat->colIndices[j];
            if (columnIndex < i) {
                LNnzbs++;
            } else if (columnIndex > i) {
                UNnzbs++;
            }
        }
    }
    LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, LNnzbs);
    UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, UNnzbs);

    // Addresses of the buffers, in the fixed slot order read by the caller
    resultPointers[0] = static_cast<void *>(colorSizes.data());
    resultPointers[1] = static_cast<void *>(PIndicesAddr.data());
    resultPointers[2] = static_cast<void *>(nnzValues.data());
    resultPointers[3] = static_cast<void *>(colIndices.data());
    resultPointers[4] = static_cast<void *>(NROffsets.data());
    resultPointers[5] = static_cast<void *>(nnzValsSizes.data());
    resultPointers[6] = static_cast<void *>(LColorSizes.data());
    resultPointers[7] = static_cast<void *>(LPIndicesAddr.data());
    resultPointers[8] = static_cast<void *>(LnnzValues.data());
    resultPointers[9] = static_cast<void *>(LColIndices.data());
    resultPointers[10] = static_cast<void *>(LNROffsets.data());
    resultPointers[11] = static_cast<void *>(LnnzValsSizes.data());
    resultPointers[12] = static_cast<void *>(UColorSizes.data());
    resultPointers[13] = static_cast<void *>(UPIndicesAddr.data());
    resultPointers[14] = static_cast<void *>(UnnzValues.data());
    resultPointers[15] = static_cast<void *>(UColIndices.data());
    resultPointers[16] = static_cast<void *>(UNROffsets.data());
    resultPointers[17] = static_cast<void *>(UnnzValsSizes.data());
    resultPointers[18] = static_cast<void *>(blockDiag.data());
    //resultPointers[19] and [20] are set by the caller

    // Worst-case sizes; create_preconditioner() overwrites these with actual sizes
    resultSizes[0] = mat->Nb * block_size;
    resultSizes[1] = colorRoundedValSize; // zeropadded valSize;
    resultSizes[2] = numColors;
    resultSizes[3] = worstCaseColumnAccessNum; //totalCols
    resultSizes[4] = NROffsetSize; //NRFlagSize
    resultSizes[5] = blockDiagSize; //diagValsSize
    resultSizes[6] = mat->Nb * block_size;
    resultSizes[7] = LColorRoundedValSize; // zeropadded LValSize;
    resultSizes[8] = numColors;
    resultSizes[9] = worstCaseColumnAccessNum; //LTotalCols
    resultSizes[10] = LNROffsetSize; //LNRFlagSize
    resultSizes[11] = blockDiagSize; //LDiagValsSize
    resultSizes[12] = mat->Nb * block_size;
    resultSizes[13] = UColorRoundedValSize; // zeropadded UValSize;
    resultSizes[14] = numColors;
    resultSizes[15] = worstCaseColumnAccessNum; //UTotalCols
    resultSizes[16] = UNROffsetSize; //UNRFlagSize
    resultSizes[17] = blockDiagSize; //UDiagValsSize

    return true;
} // end init()
/// Computes the block-ILU0 factorization of \p mat and converts the result to
/// the buffers that were allocated in init().
///
/// Steps performed:
///  - reorder \p mat into rMat using the ordering found in init(),
///  - copy the values into LUMat and factorize in place, storing the inverse of
///    every diagonal block in invDiagVals (and in the diagonal of LUMat),
///  - split LUMat into LMat (entries before the diagonal) and UMat (entries
///    after the diagonal, with the block rows in reverse order),
///  - convert rMat, LMat, UMat and the inverted diagonal to RDF,
///  - store the resulting sizes in resultSizes.
///
/// \param mat  matrix to factorize; init() must have been called with the same pattern
/// \return     false if a row has no diagonal block or a toRDF() call returns
///             non-zero, true otherwise
template <unsigned int block_size>
bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
{
    const unsigned int bs = block_size;

    Timer t_reorder;
    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
    if (verbosity >= 3) {
        std::ostringstream out;
        out << "FPGABILU0 reorder matrix: " << t_reorder.stop() << " s";
        OpmLog::info(out.str());
    }

    // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
    Timer t_memcpy;
    memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs);
    if (verbosity >= 3) {
        std::ostringstream out;
        out << "FPGABILU0 memcpy: " << t_memcpy.stop() << " s";
        OpmLog::info(out.str());
    }

    double pivot[bs * bs];
    Opm::Detail::Inverter<bs> inverter; // reuse inverter to invert blocks
    Timer t_decomposition;

    // go through all rows
    for (int i = 0; i < LUMat->Nb; i++) {
        const int iRowStart = LUMat->rowPointers[i];
        const int iRowEnd = LUMat->rowPointers[i + 1];

        // go through all elements of the row; ij is needed after the loop
        bool diagFound = false;
        int ij;
        for (ij = iRowStart; ij < iRowEnd; ij++) {
            const int j = LUMat->colIndices[ij];
            // if the element is the diagonal, store the index and go to next row
            if (j == i) {
                diagIndex[i] = ij;
                diagFound = true;
                break;
            }
            // if an element beyond the diagonal is reached, no diagonal was found
            if (j > i) {
                break;
            }
            // calculate the pivot of this row
            blockMult<bs>(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
            memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs);

            const int jRowEnd = LUMat->rowPointers[j + 1];
            int jk = diagIndex[j] + 1;
            int ik = ij + 1;
            // subtract that row scaled by the pivot from this row.
            while (ik < iRowEnd && jk < jRowEnd) {
                if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
                    blockMultSub<bs>(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs);
                    ik++;
                    jk++;
                } else if (LUMat->colIndices[ik] < LUMat->colIndices[jk]) {
                    ik++;
                } else {
                    jk++;
                }
            }
        }

        // No diagonal block in this row: either an entry beyond the diagonal
        // was hit first, or the row ended (only sub-diagonal entries, or empty).
        // Bail out before ij is used to address the diagonal block.
        // TODO: perform reordering earlier to prevent this
        if (!diagFound) {
            std::ostringstream out;
            out << "BILU0 Error could not find diagonal value in row: " << i;
            OpmLog::error(out.str());
            return false;
        }

        // store the inverse in the diagonal!
        inverter(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
        memcpy(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
    }

    LMat->rowPointers[0] = 0;
    UMat->rowPointers[0] = 0;

    // Split the LU matrix into two by comparing column indices to diagonal indices
    for (int i = 0; i < LUMat->Nb; i++) {
        LMat->rowPointers[i + 1] = LMat->rowPointers[i];
        for (int j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
            if (j < diagIndex[i]) {
                memcpy(LMat->nnzValues + (LMat->rowPointers[i + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
                LMat->colIndices[LMat->rowPointers[i + 1]] = LUMat->colIndices[j];
                LMat->rowPointers[i + 1] = LMat->rowPointers[i + 1] + 1;
            }
        }
    }

    // Reverse the order of the (blocked) rows for the U matrix,
    // because the rows are accessed in reverse order when applying the ILU0
    int URowIndex = 0;
    for (int i = LUMat->Nb - 1; i >= 0; i--) {
        UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex];
        for (int j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
            if (j > diagIndex[i]) {
                memcpy(UMat->nnzValues + (UMat->rowPointers[URowIndex + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
                UMat->colIndices[UMat->rowPointers[URowIndex + 1]] = LUMat->colIndices[j];
                UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex + 1] + 1;
            }
        }
        URowIndex++;
    }

    if (verbosity >= 3) {
        std::ostringstream out;
        out << "FPGABILU0 decomposition: " << t_decomposition.stop() << " s";
        OpmLog::info(out.str());
    }

    rowSize = block_size * rMat->Nb;
    LRowSize = block_size * LMat->Nb;
    URowSize = block_size * UMat->Nb;
    LNumColors = numColors;
    UNumColors = numColors;

    // U stores its rows in reverse order, so its colors are reversed as well
    std::vector<int> URowsPerColor(numColors);
    for (int c = 0; c < numColors; c++) {
        URowsPerColor[numColors - c - 1] = rowsPerColor[c];
    }

    int err = rMat->toRDF(numColors, rowsPerColor.data(), /*isUMatrix:*/ false,
                          colIndicesInColor, maxNNZsPerRow, nnzValsSizes.data(),
                          nnzValues, colIndices.data(), NROffsets.data(), colorSizes.data(), &valSize);
    if (err != 0) {
        return false;
    }
    err = LMat->toRDF(LNumColors, rowsPerColor.data(), /*isUMatrix:*/ false,
                      LColIndicesInColor, maxNNZsPerRow, LnnzValsSizes.data(),
                      LnnzValues, LColIndices.data(), LNROffsets.data(), LColorSizes.data(), &LValSize);
    if (err != 0) {
        return false;
    }
    err = UMat->toRDF(UNumColors, URowsPerColor.data(), /*isUMatrix:*/ true,
                      UColIndicesInColor, maxNNZsPerRow, UnnzValsSizes.data(),
                      UnnzValues, UColIndices.data(), UNROffsets.data(), UColorSizes.data(), &UValSize);
    if (err != 0) {
        return false;
    }
    blockedDiagtoRDF(invDiagVals, rMat->Nb, numColors, URowsPerColor, blockDiag.data());

    // resultPointers are set in the init method
    resultSizes[0] = rowSize;
    resultSizes[1] = colorSizes[3]; // zeropadded valSize;
    resultSizes[2] = numColors;
    resultSizes[3] = colorSizes[2]; //totalCols
    resultSizes[4] = colorSizes[5]; //NRFlagSize
    resultSizes[5] = colorSizes[6]; //diagValsSize
    resultSizes[6] = LRowSize;
    resultSizes[7] = LColorSizes[3]; // zeropadded LValSize;
    resultSizes[8] = LNumColors;
    resultSizes[9] = LColorSizes[2]; //LTotalCols
    resultSizes[10] = LColorSizes[5]; //LNRFlagSize
    resultSizes[11] = LColorSizes[6]; //LDiagValsSize
    resultSizes[12] = URowSize;
    resultSizes[13] = UColorSizes[3]; // zeropadded UValSize;
    resultSizes[14] = UNumColors;
    resultSizes[15] = UColorSizes[2]; //UTotalCols
    resultSizes[16] = UColorSizes[5]; //UNRFlagSize
    resultSizes[17] = UColorSizes[6]; //UDiagValsSize

    return true;
} // end create_preconditioner()
// Explicit instantiation of the FPGABILU0 member functions for block sizes 1-6.
// The last line of the macro must not end in a line continuation: a trailing
// backslash would splice the first invocation below into the macro body.
// The macro's final instantiation is terminated by the ';' at the call site.
#define INSTANTIATE_BDA_FUNCTIONS(n)                                        \
template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int);       \
template FPGABILU0<n>::~FPGABILU0();                                         \
template bool FPGABILU0<n>::init(BlockedMatrix<n> *);                        \
template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *)

INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);

#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace bda