/* Copyright 2019 Equinor ASA This file is part of the Open Porous Media project (OPM). OPM is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OPM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OPM. If not, see . */ #include #include #include #include #include #include #include #include #include namespace bda { using Opm::OpmLog; BILU0::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) : level_scheduling(level_scheduling_), graph_coloring(graph_coloring_), verbosity(verbosity_) { if (level_scheduling == graph_coloring) { OPM_THROW(std::logic_error, "Error, either level_scheduling or graph_coloring must be true, not both\n"); } } BILU0::~BILU0() { delete[] invDiagVals; delete[] diagIndex; delete[] rowsPerColor; freeBlockedMatrix(&LUMat); freeBlockedMatrix(&LMat); freeBlockedMatrix(&UMat); delete[] toOrder; delete[] fromOrder; freeBlockedMatrix(&rMat); } bool BILU0::init(BlockedMatrix *mat, unsigned int block_size) { double t1 = 0.0, t2 = 0.0; BlockedMatrix *CSCmat = nullptr; this->N = mat->Nb * block_size; this->Nb = mat->Nb; this->nnz = mat->nnzbs * block_size * block_size; this->nnzbs = mat->nnzbs; this->block_size = block_size; toOrder = new int[N]; fromOrder = new int[N]; if (level_scheduling) { CSCmat = new BlockedMatrix(); CSCmat->Nb = Nb; CSCmat->nnzbs = nnzbs; CSCmat->nnzValues = new Block[nnzbs]; CSCmat->colIndices = new int[nnzbs]; CSCmat->rowPointers = new int[Nb + 1]; if(verbosity >= 3){ t1 = BdaSolver::second(); } bcsr_to_bcsc(mat->nnzValues, mat->colIndices, mat->rowPointers, CSCmat->nnzValues, CSCmat->colIndices, CSCmat->rowPointers, mat->Nb); if(verbosity >= 3){ t2 = BdaSolver::second(); std::ostringstream out; out << "BILU0 convert CSR to CSC: " << t2 - t1 << " s"; OpmLog::info(out.str()); } } if(verbosity >= 3){ t1 = BdaSolver::second(); } rMat = allocateBlockedMatrix(mat->Nb, mat->nnzbs); LUMat = soft_copyBlockedMatrix(rMat); if (level_scheduling) { rowsPerColor = findLevelScheduling(mat->colIndices, mat->rowPointers, CSCmat->colIndices, CSCmat->rowPointers, mat->Nb, &numColors, toOrder, fromOrder); } else if (graph_coloring) { rowsPerColor = findGraphColoring(mat->colIndices, mat->rowPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder); } if (rowsPerColor == nullptr) { return false; } if(verbosity >= 3){ t2 = BdaSolver::second(); std::ostringstream out; out << "BILU0 analysis took: " << t2 - t1 << " s, " << numColors << " colors"; OpmLog::info(out.str()); } diagIndex = new int[mat->Nb]; invDiagVals = new Block[mat->Nb]; LMat = allocateBlockedMatrix(mat->Nb, (mat->nnzbs - mat->Nb) / 2); UMat = allocateBlockedMatrix(mat->Nb, (mat->nnzbs - mat->Nb) / 2); LUMat->nnzValues = new Block[mat->nnzbs]; if (level_scheduling) { delete[] CSCmat->nnzValues; delete[] CSCmat->colIndices; delete[] CSCmat->rowPointers; delete CSCmat; } s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Block) * LMat->nnzbs); s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Block) * UMat->nnzbs); s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LMat->nnzbs); s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * UMat->nnzbs); s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LMat->Nb + 1)); s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (UMat->Nb + 1)); s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Block) * mat->Nb); s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1)); queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, LMat->nnzbs * sizeof(Block), LMat->nnzValues); queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, UMat->nnzbs * sizeof(Block), UMat->nnzValues); queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, LMat->nnzbs * sizeof(int), LMat->colIndices); queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, UMat->nnzbs * sizeof(int), UMat->colIndices); queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (LMat->Nb + 1) * sizeof(int), LMat->rowPointers); queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (UMat->Nb + 1) * sizeof(int), UMat->rowPointers); queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(Block), invDiagVals); int *rowsPerColorPrefix = new int[numColors + 1]; int prefix = 0; for (int i = 0; i < numColors + 1; ++i) { rowsPerColorPrefix[i] = prefix; prefix += rowsPerColor[i]; } queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix); delete[] rowsPerColorPrefix; return true; } // end init() bool BILU0::create_preconditioner(BlockedMatrix *mat) { double t1 = 0.0, t2 = 0.0; if (verbosity >= 3){ t1 = BdaSolver::second(); } blocked_reorder_matrix_by_pattern(mat, toOrder, fromOrder, rMat); if (verbosity >= 3){ t2 = BdaSolver::second(); std::ostringstream out; out << "BILU0 reorder matrix: " << t2 - t1 << " s"; OpmLog::info(out.str()); } // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp if (verbosity >= 3){ t1 = BdaSolver::second(); } memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(Block) * rMat->nnzbs); if (verbosity >= 3){ t2 = BdaSolver::second(); std::ostringstream out; out << "BILU0 memcpy: " << t2 - t1 << " s"; OpmLog::info(out.str()); } int i, j, ij, ik, jk; int iRowStart, iRowEnd, jRowEnd; Block pivot; int LSize = 0; const int blockSquare = block_size * block_size; if (verbosity >= 3){ t1 = BdaSolver::second(); } // go through all rows for (i = 0; i < LUMat->Nb; i++) { iRowStart = LUMat->rowPointers[i]; iRowEnd = LUMat->rowPointers[i + 1]; // go through all elements of the row for (ij = iRowStart; ij < iRowEnd; ij++) { j = LUMat->colIndices[ij]; // if the element is the diagonal, store the index and go to next row if (j == i) { diagIndex[i] = ij; break; } // if an element beyond the diagonal is reach, no diagonal was found // throw an error now. TODO: perform reordering earlier to prevent this if (j > i) { std::ostringstream out; out << "BILU0 Error could not find diagonal value in row: " << i; OpmLog::error(out.str()); return false; } LSize++; // calculate the pivot of this row blockMult(LUMat->nnzValues[ij], invDiagVals[j], pivot); memcpy(LUMat->nnzValues[ij], &pivot, sizeof(double)*block_size * block_size); jRowEnd = LUMat->rowPointers[j + 1]; jk = diagIndex[j] + 1; ik = ij + 1; // substract that row scaled by the pivot from this row. while (ik < iRowEnd && jk < jRowEnd) { if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) { blockMultSub(LUMat->nnzValues[ik], pivot, LUMat->nnzValues[jk]); ik++; jk++; } else { if (LUMat->colIndices[ik] < LUMat->colIndices[jk]) { ik++; } else { jk++; } } } } // store the inverse in the diagonal! blockInvert3x3(LUMat->nnzValues[ij], invDiagVals[i]); memcpy(LUMat->nnzValues[ij], invDiagVals[i], sizeof(double)*block_size * block_size); } LMat->rowPointers[0] = 0; UMat->rowPointers[0] = 0; // Split the LU matrix into two by comparing column indices to diagonal indices for (i = 0; i < LUMat->Nb; i++) { int offsetL = LMat->rowPointers[i]; int rowSize = diagIndex[i] - LUMat->rowPointers[i]; int offsetLU = LUMat->rowPointers[i]; memcpy(LMat->nnzValues[offsetL], LUMat->nnzValues[offsetLU], sizeof(double) * blockSquare * rowSize); memcpy(LMat->colIndices + offsetL, LUMat->colIndices + offsetLU, sizeof(int) * rowSize); offsetL += rowSize; LMat->rowPointers[i + 1] = offsetL; } // Reverse the order or the (blocked) rows for the U matrix, // because the rows are accessed in reverse order when applying the ILU0 int URowIndex = 0; for (i = LUMat->Nb - 1; i >= 0; i--) { int offsetU = UMat->rowPointers[URowIndex]; int rowSize = LUMat->rowPointers[i + 1] - diagIndex[i] - 1; int offsetLU = diagIndex[i] + 1; memcpy(UMat->nnzValues[offsetU], LUMat->nnzValues[offsetLU], sizeof(double) * blockSquare * rowSize); memcpy(UMat->colIndices + offsetU, LUMat->colIndices + offsetLU, sizeof(int) * rowSize); offsetU += rowSize; UMat->rowPointers[URowIndex + 1] = offsetU; URowIndex++; } if (verbosity >= 3) { t2 = BdaSolver::second(); std::ostringstream out; out << "BILU0 decomposition: " << t2 - t1 << " s"; OpmLog::info(out.str()); } if (verbosity >= 3) { t1 = BdaSolver::second(); } if (pattern_uploaded == false) { queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, LMat->nnzbs * sizeof(int), LMat->colIndices); queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, UMat->nnzbs * sizeof(int), UMat->colIndices); queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (LMat->Nb + 1) * sizeof(int), LMat->rowPointers); queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (UMat->Nb + 1) * sizeof(int), UMat->rowPointers); pattern_uploaded = true; } queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, LMat->nnzbs * sizeof(Block), LMat->nnzValues); queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, UMat->nnzbs * sizeof(Block), UMat->nnzValues); queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(Block), invDiagVals); if (verbosity >= 3) { t2 = BdaSolver::second(); std::ostringstream out; out << "BILU0 copy to GPU: " << t2 - t1 << " s"; OpmLog::info(out.str()); } return true; } // end create_preconditioner() // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed // however, if individual kernel calls are timed, waiting for events is needed // behavior on other GPUs is untested void BILU0::apply(cl::Buffer& x, cl::Buffer& y) { double t1 = 0.0, t2 = 0.0; if (verbosity >= 3) { t1 = BdaSolver::second(); } const unsigned int block_size = 3; cl::Event event; for(unsigned int color = 0; color < numColors; ++color){ event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); // event.wait(); } for(int color = numColors-1; color >= 0; --color){ event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); // event.wait(); } if (verbosity >= 3) { event.wait(); t2 = BdaSolver::second(); std::ostringstream out; out << "BILU0 apply: " << t2 - t1 << " s"; OpmLog::info(out.str()); } } void BILU0::setOpenCLContext(cl::Context *context){ this->context = context; } void BILU0::setOpenCLQueue(cl::CommandQueue *queue){ this->queue = queue; } void BILU0::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){ this->work_group_size = work_group_size; this->total_work_items = total_work_items; this->lmem_per_work_group = lmem_per_work_group; } void BILU0::setKernels( cl::make_kernel *ILU_apply1, cl::make_kernel *ILU_apply2 ){ this->ILU_apply1 = ILU_apply1; this->ILU_apply2 = ILU_apply2; } } // end namespace bda