opm-simulators/opm/simulators/linalg/bda/FPGABILU0.cpp

/*
  Copyright 2020 Equinor ASA

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <config.h>

#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/MatrixBlock.hpp>
#include <dune/common/timer.hh>

#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>

namespace bda
{

using Opm::OpmLog;
using Dune::Timer;

template <unsigned int block_size>
FPGABILU0<block_size>::FPGABILU0(ILUReorder opencl_ilu_reorder_, int verbosity_, int maxRowsPerColor_, int maxColsPerColor_, int maxNNZsPerRow_, int maxNumColors_) :
    verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_), maxRowsPerColor(maxRowsPerColor_), maxColsPerColor(maxColsPerColor_), maxNNZsPerRow(maxNNZsPerRow_), maxNumColors(maxNumColors_)
{
    if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
        level_scheduling = true;
    } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
        graph_coloring = true;
    } else {
        OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
    }
}


template <unsigned int block_size>
FPGABILU0<block_size>::~FPGABILU0()
{
    delete[] invDiagVals;
}


template <unsigned int block_size>
bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
{
    const unsigned int bs = block_size;

    resultPointers.resize(numResultPointers, nullptr);
    resultSizes.resize(numResultSizes);

    // Set nnzSplit as hardcoded constant until support for more than one nnzVals read array is added.
    const unsigned int nnzSplit = 1;

    this->N = mat->Nb * block_size;
    this->Nb = mat->Nb;
    this->nnz = mat->nnzbs * block_size * block_size;
    this->nnzbs = mat->nnzbs;

    toOrder.resize(Nb);
    fromOrder.resize(Nb);

    std::vector<int> CSCRowIndices(nnzbs);
    std::vector<int> CSCColPointers(Nb + 1);

    if (level_scheduling) {
        Timer t_convert;
        csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
        if (verbosity >= 3) {
            std::ostringstream out;
            out << "FPGABILU0 convert CSR to CSC: " << t_convert.stop() << " s";
            OpmLog::info(out.str());
        }
    }

    Timer t_analysis;
    rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
    LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
    std::ostringstream out;
    if (level_scheduling) {
        out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
        findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
    } else if (graph_coloring) {
        out << "FPGABILU0 reordering strategy: " << "graph_coloring\n";
        findGraphColoring<bs>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, maxRowsPerColor, maxColsPerColor, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
    }

    if (numColors > maxNumColors) {
        std::ostringstream errorstring;
        errorstring << "ERROR: the matrix was reordered into too many colors. Created " << numColors << " colors, while hardware only supports up to " << maxNumColors << "\n";
        OPM_THROW(std::logic_error, errorstring.str());
    }

    if (verbosity >= 3) {
        out << "FPGABILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
    }
    OpmLog::info(out.str());

    int colorRoundedValSize = 0, LColorRoundedValSize = 0, UColorRoundedValSize = 0;
    int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
    int blockDiagSize = 0;
    // This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes
    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
    int doneRows = 0;
    for (int c = 0; c < numColors; c++) {
        for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
            for (int j = rMat->rowPointers[i]; j < rMat->rowPointers[i + 1]; j++) {
                int columnIndex = rMat->colIndices[j];
                if (columnIndex < i) {
                    LColorRoundedValSize += 9;
                    LNROffsetSize += 9;
                }
                if (columnIndex > i) {
                    UColorRoundedValSize += 9;
                    UNROffsetSize += 9;
                }
                colorRoundedValSize += 9;
                NROffsetSize += 9;
            }
            blockDiagSize += 12;
        }
        // End of color: round all sizes to nearest cacheline
        colorRoundedValSize = roundUpTo(colorRoundedValSize, 32);
        LColorRoundedValSize = roundUpTo(LColorRoundedValSize, 32);
        UColorRoundedValSize = roundUpTo(UColorRoundedValSize, 32);
        NROffsetSize = roundUpTo(NROffsetSize, 64);
        LNROffsetSize = roundUpTo(LNROffsetSize, 64);
        UNROffsetSize = roundUpTo(UNROffsetSize, 64);
        blockDiagSize = roundUpTo(blockDiagSize, 8);

        doneRows += rowsPerColor[c];
    }
    int colorSizesNum = 8 + roundUpTo(4 * numColors, 16);
    int worstCaseColumnAccessNum = numColors * maxColsPerColor;

    nnzValues.resize(nnzSplit, std::vector<double>(colorRoundedValSize));
    LnnzValues.resize(nnzSplit, std::vector<double>(LColorRoundedValSize));
    UnnzValues.resize(nnzSplit, std::vector<double>(UColorRoundedValSize));
    // initial number of nnz, used to allocate
    nnzValsSizes.resize(nnzSplit, colorRoundedValSize);
    LnnzValsSizes.resize(nnzSplit, LColorRoundedValSize);
    UnnzValsSizes.resize(nnzSplit, UColorRoundedValSize);
    colIndices.resize(colorRoundedValSize);
    LColIndices.resize(LColorRoundedValSize);
    UColIndices.resize(UColorRoundedValSize);
    NROffsets.resize(NROffsetSize);
    LNROffsets.resize(LNROffsetSize);
    UNROffsets.resize(UNROffsetSize);
    PIndicesAddr.resize(worstCaseColumnAccessNum);
    LPIndicesAddr.resize(worstCaseColumnAccessNum);
    UPIndicesAddr.resize(worstCaseColumnAccessNum);
    colorSizes.resize(colorSizesNum);
    LColorSizes.resize(colorSizesNum);
    UColorSizes.resize(colorSizesNum);
    blockDiag.resize(blockDiagSize);
    colIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
    LColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
    UColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));

    int err = rMat->findPartitionColumns(numColors, rowsPerColor.data(),
                                         maxRowsPerColor, maxColsPerColor,
                                         colIndicesInColor, PIndicesAddr.data(), colorSizes.data(),
                                         LColIndicesInColor, LPIndicesAddr.data(), LColorSizes.data(),
                                         UColIndicesInColor, UPIndicesAddr.data(), UColorSizes.data());
    if (err != 0) {
        std::ostringstream errorstring;
        errorstring << "ERROR: findPartitionColumns failed, code " << err << "\n";
        OPM_THROW(std::logic_error, errorstring.str());
    }

    diagIndex.resize(mat->Nb, 0);
    invDiagVals = new double[mat->Nb * bs * bs];
    LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
    UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
    resultPointers[0] = (void *) colorSizes.data();
    resultPointers[1] = (void *) PIndicesAddr.data();
    resultPointers[2] = (void *) nnzValues.data();
    resultPointers[3] = (void *) colIndices.data();
    resultPointers[4] = (void *) NROffsets.data();
    resultPointers[5] = (void *) nnzValsSizes.data();
    resultPointers[6] = (void *) LColorSizes.data();
    resultPointers[7] = (void *) LPIndicesAddr.data();
    resultPointers[8] = (void *) LnnzValues.data();
    resultPointers[9] = (void *) LColIndices.data();
    resultPointers[10] = (void *) LNROffsets.data();
    resultPointers[11] = (void *) LnnzValsSizes.data();
    resultPointers[12] = (void *) UColorSizes.data();
    resultPointers[13] = (void *) UPIndicesAddr.data();
    resultPointers[14] = (void *) UnnzValues.data();
    resultPointers[15] = (void *) UColIndices.data();
    resultPointers[16] = (void *) UNROffsets.data();
    resultPointers[17] = (void *) UnnzValsSizes.data();
    resultPointers[18] = (void *) blockDiag.data();
    //resultPointers[19] and [20] are set by the caller
    resultSizes[0] = mat->Nb * block_size;
    resultSizes[1] = colorRoundedValSize; // zeropadded valSize;
    resultSizes[2] = numColors;
    resultSizes[3] = worstCaseColumnAccessNum; //totalCols
    resultSizes[4] = NROffsetSize; //NRFlagSize
    resultSizes[5] = blockDiagSize; //diagValsSize
    resultSizes[6] = mat->Nb * block_size;
    resultSizes[7] = LColorRoundedValSize; // zeropadded LValSize;
    resultSizes[8] = numColors;
    resultSizes[9] = worstCaseColumnAccessNum; //LTotalCols
    resultSizes[10] = LNROffsetSize; //LNRFlagSize
    resultSizes[11] = blockDiagSize; //LDiagValsSize
    resultSizes[12] = mat->Nb * block_size;
    resultSizes[13] = UColorRoundedValSize; // zeropadded UValSize;
    resultSizes[14] = numColors;
    resultSizes[15] = worstCaseColumnAccessNum; //UTotalCols
    resultSizes[16] = UNROffsetSize; //UNRFlagSize
    resultSizes[17] = blockDiagSize; //UDiagValsSize
    return true;
} // end init()


template <unsigned int block_size>
bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
{
    const unsigned int bs = block_size;
    Timer t_reorder;
    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());

    if (verbosity >= 3) {
        std::ostringstream out;
        out << "FPGABILU0 reorder matrix: " << t_reorder.stop() << " s";
        OpmLog::info(out.str());
    }

    // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
    Timer t_memcpy;
    memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs);

    if (verbosity >= 3) {
        std::ostringstream out;
        out << "FPGABILU0 memcpy: " << t_memcpy.stop() << " s";
        OpmLog::info(out.str());
    }

    int i, j, ij, ik, jk;
    int iRowStart, iRowEnd, jRowEnd;
    double pivot[bs * bs];
    int LSize = 0;
    Opm::Detail::Inverter<bs> inverter;   // reuse inverter to invert blocks

    Timer t_decomposition;

    // go through all rows
    for (i = 0; i < LUMat->Nb; i++) {
        iRowStart = LUMat->rowPointers[i];
        iRowEnd = LUMat->rowPointers[i + 1];

        // go through all elements of the row
        for (ij = iRowStart; ij < iRowEnd; ij++) {
            j = LUMat->colIndices[ij];
            // if the element is the diagonal, store the index and go to next row
            if (j == i) {
                diagIndex[i] = ij;
                break;
            }
            // if an element beyond the diagonal is reach, no diagonal was found
            // throw an error now. TODO: perform reordering earlier to prevent this
            if (j > i) {
                std::ostringstream out;
                out << "BILU0 Error could not find diagonal value in row: " << i;
                OpmLog::error(out.str());
                return false;
            }

            LSize++;
            // calculate the pivot of this row
            blockMult<bs>(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);

            memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs);

            jRowEnd = LUMat->rowPointers[j + 1];
            jk = diagIndex[j] + 1;
            ik = ij + 1;
            // substract that row scaled by the pivot from this row.
            while (ik < iRowEnd && jk < jRowEnd) {
                if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
                    blockMultSub<bs>(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs);
                    ik++;
                    jk++;
                } else {
                    if (LUMat->colIndices[ik] < LUMat->colIndices[jk])
                    { ik++; }
                    else
                    { jk++; }
                }
            }
        }
        // store the inverse in the diagonal!
        inverter(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
        memcpy(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
    }

    LMat->rowPointers[0] = 0;
    UMat->rowPointers[0] = 0;

    // Split the LU matrix into two by comparing column indices to diagonal indices
    for (i = 0; i < LUMat->Nb; i++) {
        LMat->rowPointers[i + 1] = LMat->rowPointers[i];
        for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
            if (j < diagIndex[i]) {
                memcpy(LMat->nnzValues + (LMat->rowPointers[i + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
                LMat->colIndices[LMat->rowPointers[i + 1]] = LUMat->colIndices[j];
                LMat->rowPointers[i + 1] = LMat->rowPointers[i + 1] + 1;
            }
        }
    }
    // Reverse the order or the (blocked) rows for the U matrix,
    // because the rows are accessed in reverse order when applying the ILU0
    int URowIndex = 0;
    for (i = LUMat->Nb - 1; i >= 0; i--) {
        UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex];
        for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
            if (j > diagIndex[i]) {
                memcpy(UMat->nnzValues + (UMat->rowPointers[URowIndex + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
                UMat->colIndices[UMat->rowPointers[URowIndex + 1]] = LUMat->colIndices[j];
                UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex + 1] + 1;
            }
        }
        URowIndex++;
    }

    if (verbosity >= 3) {
        std::ostringstream out;
        out << "FPGABILU0 decomposition: " << t_decomposition.stop() << " s";
        OpmLog::info(out.str());
    }

    std::vector<int> URowsPerColor(numColors);
    rowSize = block_size * rMat->Nb;
    LRowSize = block_size * LMat->Nb;
    URowSize = block_size * UMat->Nb;
    LNumColors = numColors;
    UNumColors = numColors;
    for (int c = 0; c < numColors; c++) {
        URowsPerColor[numColors - c - 1] = rowsPerColor[c];
    }
    int err;
    err = rMat->toRDF(numColors, rowsPerColor.data(), /*isUMatrix:*/ false,
                      colIndicesInColor, maxNNZsPerRow, nnzValsSizes.data(),
                      nnzValues, colIndices.data(), NROffsets.data(), colorSizes.data(), &valSize);
    if (err != 0) {
        return false;
    }
    err = LMat->toRDF(LNumColors, rowsPerColor.data(), /*isUMatrix:*/ false,
                      LColIndicesInColor, maxNNZsPerRow, LnnzValsSizes.data(),
                      LnnzValues, LColIndices.data(), LNROffsets.data(), LColorSizes.data(), &LValSize);
    if (err != 0) {
        return false;
    }
    err = UMat->toRDF(UNumColors, URowsPerColor.data(), /*isUMatrix:*/ true,
                      UColIndicesInColor, maxNNZsPerRow, UnnzValsSizes.data(),
                      UnnzValues, UColIndices.data(), UNROffsets.data(), UColorSizes.data(), &UValSize);
    if (err != 0) {
        return false;
    }
    blockedDiagtoRDF(invDiagVals, rMat->Nb, numColors, URowsPerColor, blockDiag.data());
    // resultPointers are set in the init method
    resultSizes[0] = rowSize;
    resultSizes[1] = colorSizes[3]; // zeropadded valSize;
    resultSizes[2] = numColors;
    resultSizes[3] = colorSizes[2]; //totalCols
    resultSizes[4] = colorSizes[5]; //NRFlagSize
    resultSizes[5] = colorSizes[6]; //diagValsSize
    resultSizes[6] = LRowSize;
    resultSizes[7] = LColorSizes[3]; // zeropadded LValSize;
    resultSizes[8] = LNumColors;
    resultSizes[9] = LColorSizes[2]; //LTotalCols
    resultSizes[10] = LColorSizes[5]; //LNRFlagSize
    resultSizes[11] = LColorSizes[6]; //LDiagValsSize
    resultSizes[12] = URowSize;
    resultSizes[13] = UColorSizes[3]; // zeropadded UValSize;
    resultSizes[14] = UNumColors;
    resultSizes[15] = UColorSizes[2]; //UTotalCols
    resultSizes[16] = UColorSizes[5]; //UNRFlagSize
    resultSizes[17] = UColorSizes[6]; //UDiagValsSize
    return true;
} // end create_preconditioner()


#define INSTANTIATE_BDA_FUNCTIONS(n)                                    \
template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int);  \
template FPGABILU0<n>::~FPGABILU0();                                    \
template bool FPGABILU0<n>::init(BlockedMatrix<n> *);                   \
template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *);  \

INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);

#undef INSTANTIATE_BDA_FUNCTIONS

} //namespace bda