From 141af23db56b3b2d94eb0959eef77a262eb0a968 Mon Sep 17 00:00:00 2001 From: Tong Dong Qiu Date: Tue, 23 Feb 2021 14:15:47 +0100 Subject: [PATCH] Exact ILU decomp is now performed on GPU --- opm/simulators/linalg/bda/BILU0.cpp | 261 ++++++++---------- opm/simulators/linalg/bda/BILU0.hpp | 48 +++- opm/simulators/linalg/bda/Reorder.cpp | 13 +- opm/simulators/linalg/bda/Reorder.hpp | 4 +- .../linalg/bda/WellContributions.cpp | 2 +- opm/simulators/linalg/bda/openclKernels.hpp | 158 ++++++++++- .../linalg/bda/openclSolverBackend.cpp | 9 +- .../linalg/bda/openclSolverBackend.hpp | 6 +- 8 files changed, 323 insertions(+), 178 deletions(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 991dc14b1..492cd6536 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -18,6 +18,7 @@ */ #include + #include #include #include @@ -32,14 +33,6 @@ namespace bda { -// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU -// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in: -// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896 -// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU -// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU -#define CHOW_PATEL 0 -#define CHOW_PATEL_GPU 1 - using Opm::OpmLog; using Dune::Timer; @@ -126,41 +119,46 @@ namespace bda diagIndex = new int[mat->Nb]; invDiagVals = new double[mat->Nb * bs * bs]; +#if CHOW_PATEL Lmat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); Umat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); +#endif + + LUmat->nnzValues = new double[mat->nnzbs * bs * bs]; - s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); - s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs); - s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); - s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Umat->nnzbs); - s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); - s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Umat->Nb + 1)); s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb); s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1)); + s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb); +#if CHOW_PATEL + s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); + s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); + s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); + s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); + s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); + s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); +#else + s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs); + s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs); + s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1)); +#endif - queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues); - queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues); - queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices); - queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices); - queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers); - queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers); queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals); - int *rowsPerColorPrefix = new int[numColors + 1]; - rowsPerColorPrefix[0] = 0; + rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0 for (int i = 0; i < numColors; ++i) { rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i]; } - queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix); - delete[] rowsPerColorPrefix; + queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data()); return true; } // end init() + // implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015 template void BILU0::chow_patel_decomposition() { +#if CHOW_PATEL const unsigned int bs = block_size; int num_sweeps = 6; @@ -399,7 +397,7 @@ namespace bda // convert Ut to BSR // diagonal stored separately std::vector ptr(Nb+1, 0); - std::vector col(Ut->rowPointers[Nb]); + std::vector cols(Ut->rowPointers[Nb]); // count blocks per row for U (BSR) // store diagonal in invDiagVals @@ -423,7 +421,7 @@ namespace bda int j = Ut->colIndices[k]; if (j != i) { int head = ptr[j]++; - col[head] = i; + cols[head] = i; memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs); } } @@ -433,21 +431,39 @@ namespace bda std::rotate(ptr.begin(), ptr.end() - 1, ptr.end()); ptr.front() = 0; - // reversing the rows of U, because that is the order they are used in during ILU apply - int URowIndex = 0; - int offsetU = 0; // number of nnz blocks that are already copied to Umat - Umat->rowPointers[0] = 0; - for (int i = LUmat->Nb - 1; i >= 0; i--) { - int rowSize = ptr[i + 1] - ptr[i]; // number of blocks in this row - memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize); - memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize); - offsetU += rowSize; - Umat->rowPointers[URowIndex + 1] = offsetU; - URowIndex++; - } + std::call_once(pattern_uploaded, [&](){ + // find the positions of each diagonal block + // must be done after reordering + for (int row = 0; row < Nb; ++row) { + int rowStart = LUmat->rowPointers[row]; + int rowEnd = LUmat->rowPointers[row+1]; + bool diagFound = false; + for (int ij = rowStart; ij < rowEnd; ++ij) { + int col = LUmat->colIndices[ij]; + if (row == col) { + diagIndex[row] = ij; + diagFound = true; + break; + } + } + if (!diagFound) { + OPM_THROW(std::logic_error, "Error did not find diagonal block in reordered matrix"); + } + } + queue->enqueueWriteBuffer(s.diagIndex, CL_TRUE, 0, Nb * sizeof(int), diagIndex); + queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices); + queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers); + queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), cols.data()); + queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), ptr.data()); + }); + + + queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * bs * bs * sizeof(double), Lmat->nnzValues); + queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * bs * bs * sizeof(double), Utmp); + queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, LUmat->Nb * bs * bs * sizeof(double), invDiagVals); delete[] Utmp; - +#endif // CHOW_PATEL } template @@ -479,118 +495,66 @@ namespace bda OpmLog::info(out.str()); } - Timer t_decomposition; #if CHOW_PATEL chow_patel_decomposition(); #else - int i, j, ij, ik, jk; - int iRowStart, iRowEnd, jRowEnd; - double pivot[bs * bs]; + Timer t_copyToGpu; - int LSize = 0; - Opm::Detail::Inverter inverter; // reuse inverter to invert blocks + queue->enqueueWriteBuffer(s.LUvals, CL_TRUE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues); - // go through all rows - for (i = 0; i < LUmat->Nb; i++) { - iRowStart = LUmat->rowPointers[i]; - iRowEnd = LUmat->rowPointers[i + 1]; - - // go through all elements of the row - for (ij = iRowStart; ij < iRowEnd; ij++) { - j = LUmat->colIndices[ij]; - // if the element is the diagonal, store the index and go to next row - if (j == i) { - diagIndex[i] = ij; - break; - } - // if an element beyond the diagonal is reach, no diagonal was found - // throw an error now. TODO: perform reordering earlier to prevent this - if (j > i) { - std::ostringstream out; - out << "BILU0 Error could not find diagonal value in row: " << i; - OpmLog::error(out.str()); - return false; - } - - LSize++; - // calculate the pivot of this row - blockMult(LUmat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]); - - memcpy(LUmat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * block_size * block_size); - - jRowEnd = LUmat->rowPointers[j + 1]; - jk = diagIndex[j] + 1; - ik = ij + 1; - // substract that row scaled by the pivot from this row. - while (ik < iRowEnd && jk < jRowEnd) { - if (LUmat->colIndices[ik] == LUmat->colIndices[jk]) { - blockMultSub(LUmat->nnzValues + ik * bs * bs, pivot, LUmat->nnzValues + jk * bs * bs); - ik++; - jk++; - } else { - if (LUmat->colIndices[ik] < LUmat->colIndices[jk]) - { ik++; } - else - { jk++; } + std::call_once(pattern_uploaded, [&](){ + // find the positions of each diagonal block + // must be done after reordering + for (int row = 0; row < Nb; ++row) { + int rowStart = LUmat->rowPointers[row]; + int rowEnd = LUmat->rowPointers[row+1]; + bool diagFound = false; + for (int ij = rowStart; ij < rowEnd; ++ij) { + int col = LUmat->colIndices[ij]; + if (row == col) { + diagIndex[row] = ij; + diagFound = true; + break; } } + if (!diagFound) { + OPM_THROW(std::logic_error, "Error did not find diagonal block in reordered matrix"); + } } - // store the inverse in the diagonal! - inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs); + queue->enqueueWriteBuffer(s.diagIndex, CL_TRUE, 0, Nb * sizeof(int), diagIndex); + queue->enqueueWriteBuffer(s.LUcols, CL_TRUE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices); + queue->enqueueWriteBuffer(s.LUrows, CL_TRUE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers); + }); - memcpy(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs); - } - - Lmat->rowPointers[0] = 0; - Umat->rowPointers[0] = 0; - - // Split the LU matrix into two by comparing column indices to diagonal indices - for (i = 0; i < LUmat->Nb; i++) { - int offsetL = Lmat->rowPointers[i]; - int rowSize = diagIndex[i] - LUmat->rowPointers[i]; - int offsetLU = LUmat->rowPointers[i]; - memcpy(Lmat->nnzValues + offsetL * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize); - memcpy(Lmat->colIndices + offsetL, LUmat->colIndices + offsetLU, sizeof(int) * rowSize); - offsetL += rowSize; - Lmat->rowPointers[i + 1] = offsetL; - } - // Reverse the order or the (blocked) rows for the U matrix, - // because the rows are accessed in reverse order when applying the ILU0 - int URowIndex = 0; - for (i = LUmat->Nb - 1; i >= 0; i--) { - int offsetU = Umat->rowPointers[URowIndex]; - int rowSize = LUmat->rowPointers[i + 1] - diagIndex[i] - 1; - int offsetLU = diagIndex[i] + 1; - memcpy(Umat->nnzValues + offsetU * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize); - memcpy(Umat->colIndices + offsetU, LUmat->colIndices + offsetLU, sizeof(int) * rowSize); - offsetU += rowSize; - Umat->rowPointers[URowIndex + 1] = offsetU; - URowIndex++; - } -#endif - if (verbosity >= 3) { - std::ostringstream out; - out << "BILU0 decomposition: " << t_decomposition.stop() << " s"; - OpmLog::info(out.str()); - } - - Timer t_copyToGpu; - if (pattern_uploaded == false) { - queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices); - queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices); - queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers); - queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers); - pattern_uploaded = true; - } - queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues); - queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues); - queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(double) * bs * bs, invDiagVals); if (verbosity >= 3) { std::ostringstream out; out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s"; OpmLog::info(out.str()); } + Timer t_decomposition; + std::ostringstream out; + for (int color = 0; color < numColors; ++color) { + const unsigned int firstRow = rowsPerColorPrefix[color]; + const unsigned int lastRow = rowsPerColorPrefix[color+1]; + const unsigned int work_group_size2 = 128; + const unsigned int num_work_groups2 = 1024; + const unsigned int total_work_items2 = num_work_groups2 * work_group_size2; + const unsigned int num_hwarps_per_group = work_group_size2 / 16; + const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double); // each block needs a pivot + if (verbosity >= 4) { + out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; + } + cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2)); + event.wait(); + } + + if (verbosity >= 3) { + out << "BILU0 decomposition: " << t_decomposition.stop() << " s"; + OpmLog::info(out.str()); + } +#endif // CHOW_PATEL + return true; } // end create_preconditioner() @@ -604,16 +568,24 @@ namespace bda Timer t_apply; for(int color = 0; color < numColors; ++color){ - event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#if CHOW_PATEL + event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#else + event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#endif // event.wait(); } for(int color = numColors-1; color >= 0; --color){ - event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#if CHOW_PATEL + event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#else + event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#endif // event.wait(); } - if (verbosity >= 3) { + if (verbosity >= 4) { event.wait(); std::ostringstream out; out << "BILU0 apply: " << t_apply.stop() << " s"; @@ -638,11 +610,13 @@ namespace bda } template void BILU0::setKernels( - cl::make_kernel *ILU_apply1_, - cl::make_kernel *ILU_apply2_ + cl::make_kernel *ILU_apply1_, + cl::make_kernel *ILU_apply2_, + cl::make_kernel *ilu_decomp_k_ ){ this->ILU_apply1 = ILU_apply1_; this->ILU_apply2 = ILU_apply2_; + this->ilu_decomp_k = ilu_decomp_k_; } @@ -657,8 +631,9 @@ template void BILU0::setOpenCLContext(cl::Context*); template void BILU0::setOpenCLQueue(cl::CommandQueue*); \ template void BILU0::setKernelParameters(unsigned int, unsigned int, unsigned int); \ template void BILU0::setKernels( \ - cl::make_kernel *, \ - cl::make_kernel * \ + cl::make_kernel *, \ + cl::make_kernel *, \ + cl::make_kernel * \ ); INSTANTIATE_BDA_FUNCTIONS(1); diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp index 67d945851..0247c3934 100644 --- a/opm/simulators/linalg/bda/BILU0.hpp +++ b/opm/simulators/linalg/bda/BILU0.hpp @@ -20,12 +20,26 @@ #ifndef BILU0_HPP #define BILU0_HPP +#include + #include #include #include #include +// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU +// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in: +// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896 +// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU +// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU +// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U +// the exact decomposition uses a full matrix LU which is the superposition of L and U +// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split +#define CHOW_PATEL 0 +#define CHOW_PATEL_GPU 1 + + namespace bda { @@ -40,33 +54,46 @@ namespace bda int Nb; // number of blockrows of the matrix int nnz; // number of nonzeroes of the matrix (scalar) int nnzbs; // number of blocks of the matrix - std::unique_ptr > Lmat = nullptr, Umat = nullptr, LUmat = nullptr; + std::unique_ptr > LUmat = nullptr; std::shared_ptr > rmat = nullptr; // only used with PAR_SIM +#if CHOW_PATEL + std::unique_ptr > Lmat = nullptr, Umat = nullptr; +#endif double *invDiagVals = nullptr; int *diagIndex = nullptr; - std::vector rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel + std::vector rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel + std::vector rowsPerColorPrefix; // the prefix sum of rowsPerColor int *toOrder = nullptr, *fromOrder = nullptr; int numColors; int verbosity; + std::once_flag pattern_uploaded; ILUReorder opencl_ilu_reorder; typedef struct { - cl::Buffer Lvals, Uvals, invDiagVals; - cl::Buffer Lcols, Lrows; - cl::Buffer Ucols, Urows; + cl::Buffer invDiagVals; + cl::Buffer diagIndex; cl::Buffer rowsPerColor; +#if CHOW_PATEL + cl::Buffer Lvals, Lcols, Lrows; + cl::Buffer Uvals, Ucols, Urows; +#else + cl::Buffer LUvals, LUcols, LUrows; +#endif } GPU_storage; - cl::make_kernel *ILU_apply1; - cl::make_kernel *ILU_apply2; + cl::make_kernel *ILU_apply1; + cl::make_kernel *ILU_apply2; + cl::make_kernel *ilu_decomp_k; + GPU_storage s; cl::Context *context; cl::CommandQueue *queue; int work_group_size = 0; int total_work_items = 0; int lmem_per_work_group = 0; - bool pattern_uploaded = false; ChowPatelIlu chowPatelIlu; @@ -91,8 +118,9 @@ namespace bda void setOpenCLQueue(cl::CommandQueue *queue); void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group); void setKernels( - cl::make_kernel *ILU_apply1, - cl::make_kernel *ILU_apply2 + cl::make_kernel *ILU_apply1, + cl::make_kernel *ILU_apply2, + cl::make_kernel *ilu_decomp_k ); int* getToOrder() diff --git a/opm/simulators/linalg/bda/Reorder.cpp b/opm/simulators/linalg/bda/Reorder.cpp index b7eebee57..cae3b35f9 100644 --- a/opm/simulators/linalg/bda/Reorder.cpp +++ b/opm/simulators/linalg/bda/Reorder.cpp @@ -201,11 +201,10 @@ void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, void colorsToReordering(int Nb, std::vector& colors, int numColors, int *toOrder, int *fromOrder, std::vector& rowsPerColor) { int reordered = 0; - int i, c; // Find reordering patterns - for (c = 0; c < numColors; c++) { - for (i = 0; i < Nb; i++) { + for (int c = 0; c < numColors; c++) { + for (int i = 0; i < Nb; i++) { if (colors[i] == c) { rowsPerColor[c]++; toOrder[i] = reordered; @@ -259,10 +258,11 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd int activeRowIndex = 0, colorEnd, nextActiveRowIndex = 0; int thisRow; std::vector doneRows(Nb, false); - rowsPerColor.reserve(Nb); - std::vector rowsToStart; + // since emplace_back() is used to fill, the vector must be empty + assert(rowsPerColor.size() == 0); + // find starting rows: rows that are independent from all rows that come before them. for (thisRow = 0; thisRow < Nb; thisRow++) { if (canBeStarted(thisRow, CSCColPointers, CSCRowIndices, doneRows)) { @@ -312,8 +312,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd template void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector& rowsPerColor) { - std::vector rowColor; - rowColor.resize(Nb); + std::vector rowColor(Nb); *numColors = colorBlockedNodes(Nb, CSRRowPointers, CSRColIndices, CSCColPointers, CSCRowIndices, rowColor, maxRowsPerColor, maxColsPerColor); diff --git a/opm/simulators/linalg/bda/Reorder.hpp b/opm/simulators/linalg/bda/Reorder.hpp index 7ae2c90db..08c8716ff 100644 --- a/opm/simulators/linalg/bda/Reorder.hpp +++ b/opm/simulators/linalg/bda/Reorder.hpp @@ -90,7 +90,7 @@ bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIn /// \param[out] numColors a pointer to the number of colors needed for the level scheduling /// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved /// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved -/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color +/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector& rowsPerColor); /// Find a graph coloring reordering for an input matrix @@ -105,7 +105,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd /// \param[out] numColors the number of colors used in the found graph coloring /// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved /// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved -/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color +/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function will resize() template void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector& rowsPerColor); diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index f17c53c3b..ada2a2723 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -22,7 +22,7 @@ #include #include #include -#include + #include namespace Opm diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index e3ffadf60..3d86c3060 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -225,7 +225,7 @@ namespace bda __global const double *Lvals, __global const unsigned int *Lcols, __global const unsigned int *Lrows, - const unsigned int LrowSize, + __global const int *diagIndex, __global const double *y, __global double *x, __global const unsigned int *nodesPerColorPrefix, @@ -250,7 +250,7 @@ namespace bda while(target_block_row < nodesPerColorPrefix[color+1]){ const unsigned int first_block = Lrows[target_block_row]; - const unsigned int last_block = Lrows[target_block_row+1]; + const unsigned int last_block = diagIndex[target_block_row]; unsigned int block = first_block + lane / (bs*bs); double local_out = 0.0; if(lane < num_active_threads){ @@ -292,10 +292,10 @@ namespace bda // solves U*x=y where L is a lower triangular sparse blocked matrix inline const char* ILU_apply2_s = R"( __kernel void ILU_apply2( - __global const double *Uvals, - __global const int *Ucols, - __global const int *Urows, - const unsigned int UrowSize, + __global const double *LUvals, + __global const int *LUcols, + __global const int *LUrows, + __global const int *diagIndex, __global const double *invDiagVals, __global double *x, __global const unsigned int *nodesPerColorPrefix, @@ -320,8 +320,8 @@ namespace bda const double relaxation = 0.9; while(target_block_row < nodesPerColorPrefix[color+1]){ - const unsigned int first_block = Urows[UrowSize-target_block_row-1]; - const unsigned int last_block = Urows[UrowSize-target_block_row]; + const unsigned int first_block = diagIndex[target_block_row] + 1; + const unsigned int last_block = LUrows[target_block_row+1]; unsigned int block = first_block + lane / (bs*bs); double local_out = 0.0; if(lane < num_active_threads){ @@ -330,8 +330,8 @@ namespace bda local_out = x[row]; } for(; block < last_block; block += num_blocks_per_warp){ - const double x_elem = x[Ucols[block]*bs + c]; - const double A_elem = Uvals[block*bs*bs + c + r*bs]; + const double x_elem = x[LUcols[block]*bs + c]; + const double A_elem = LUvals[block*bs*bs + c + r*bs]; local_out -= x_elem * A_elem; } } @@ -513,6 +513,144 @@ namespace bda } )"; + inline const char* ilu_decomp_s = R"( + + // a = a - (b * c) + __kernel void block_mult_sub(__global double *a, __local double *b, __global double *c) + { + const unsigned int block_size = 3; + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < block_size * block_size){ + const unsigned int row = thread_id_in_hwarp / block_size; + const unsigned int col = thread_id_in_hwarp % block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += b[block_size * row + k] * c[block_size * k + col]; + } + a[block_size * row + col] -= temp; + } + } + + // c = a * b + __kernel void block_mult(__global double *a, __global double *b, __local double *c) { + const unsigned int block_size = 3; + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < block_size * block_size){ + const unsigned int row = thread_id_in_hwarp / block_size; + const unsigned int col = thread_id_in_hwarp % block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += a[block_size * row + k] * b[block_size * k + col]; + } + c[block_size * row + col] = temp; + } + } + + // invert 3x3 matrix + __kernel void inverter(__global double *matrix, __global double *inverse) { + const unsigned int block_size = 3; + const unsigned int bs = block_size; // rename to shorter name + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < bs * bs){ + double t4 = matrix[0] * matrix[4]; + double t6 = matrix[0] * matrix[5]; + double t8 = matrix[1] * matrix[3]; + double t10 = matrix[2] * matrix[3]; + double t12 = matrix[1] * matrix[6]; + double t14 = matrix[2] * matrix[6]; + + double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] + + t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]); + double t17 = 1.0 / det; + + const unsigned int r = thread_id_in_hwarp / bs; + const unsigned int c = thread_id_in_hwarp % bs; + const unsigned int r1 = (r+1) % bs; + const unsigned int c1 = (c+1) % bs; + const unsigned int r2 = (r+bs-1) % bs; + const unsigned int c2 = (c+bs-1) % bs; + inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17; + } + } + + __kernel void ilu_decomp(const unsigned int firstRow, + const unsigned int lastRow, + __global double *LUvals, + __global const int *LUcols, + __global const int *LUrows, + __global double *invDiagVals, + __global int *diagIndex, + const unsigned int Nb, + __local double *pivot){ + + const unsigned int bs = 3; + const unsigned int hwarp_size = 16; + const unsigned int work_group_size = get_local_size(0); + const unsigned int work_group_id = get_group_id(0); + const unsigned int num_groups = get_num_groups(0); + const unsigned int hwarps_per_group = work_group_size / hwarp_size; + const unsigned int thread_id_in_group = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size; // thread id in hwarp (16 threads) + const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size; + const unsigned int lmem_offset = hwarp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it + // every workitem in a hwarp has the same lmem_offset + + // go through all rows + for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group) + { + int iRowStart = LUrows[i]; + int iRowEnd = LUrows[i + 1]; + + // go through all elements of the row + for (int ij = iRowStart; ij < iRowEnd; ij++) { + int j = LUcols[ij]; + + if (j < i) { + // calculate the pivot of this row + block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset); + + // copy pivot + if (thread_id_in_hwarp < bs * bs) { + LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp]; + } + + int jRowEnd = LUrows[j + 1]; + int jk = diagIndex[j] + 1; + int ik = ij + 1; + // substract that row scaled by the pivot from this row. + while (ik < iRowEnd && jk < jRowEnd) { + if (LUcols[ik] == LUcols[jk]) { + block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs); + ik++; + jk++; + } else { + if (LUcols[ik] < LUcols[jk]) + { ik++; } + else + { jk++; } + } + } + } + } + + // store the inverse in the diagonal + inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs); + + // copy inverse + if (thread_id_in_hwarp < bs * bs) { + LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp]; + } + + } + } + )"; + } // end namespace bda #endif diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index 43ba050e2..6c8b0d979 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -484,6 +484,7 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s))); source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s))); source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s))); + source.emplace_back(std::make_pair(ilu_decomp_s, strlen(ilu_decomp_s))); program = cl::Program(*context, source); program.build(devices); @@ -527,8 +528,8 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub axpy_k.reset(new cl::make_kernel(cl::Kernel(program, "axpy"))); custom_k.reset(new cl::make_kernel(cl::Kernel(program, "custom"))); spmv_blocked_k.reset(new cl::make_kernel(cl::Kernel(program, "spmv_blocked"))); - ILU_apply1_k.reset(new cl::make_kernel(cl::Kernel(program, "ILU_apply1"))); - ILU_apply2_k.reset(new cl::make_kernel(cl::Kernel(program, "ILU_apply2"))); + ILU_apply1_k.reset(new cl::make_kernel(cl::Kernel(program, "ILU_apply1"))); + ILU_apply2_k.reset(new cl::make_kernel(cl::Kernel(program, "ILU_apply2"))); stdwell_apply_k.reset(new cl::make_kernel::initialize(int N_, int nnz_, int dim, doub cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder"))); + ilu_decomp_k.reset(new cl::make_kernel(cl::Kernel(program, "ilu_decomp"))); - prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get()); + prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get()); } catch (const cl::Error& error) { std::ostringstream oss; diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp index aae527643..3e9592ef2 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp @@ -70,8 +70,8 @@ private: std::unique_ptr > axpy_k; std::unique_ptr > custom_k; std::unique_ptr > spmv_blocked_k; - std::shared_ptr > ILU_apply1_k; - std::shared_ptr > ILU_apply2_k; + std::shared_ptr > ILU_apply1_k; + std::shared_ptr > ILU_apply2_k; std::shared_ptr > stdwell_apply_no_reorder_k; + std::shared_ptr > ilu_decomp_k; Preconditioner *prec = nullptr; // only supported preconditioner is BILU0 int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings