diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 5259d65b4..500c31102 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -60,6 +60,7 @@ if(OPENCL_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp) diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index 4d4c19b9b..eb83b7d3f 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -250,12 +250,9 @@ namespace Opm if (use_gpu) { const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode); WellContributions wellContribs(gpu_mode); -#if HAVE_OPENCL - if(gpu_mode.compare("opencl") == 0){ - const auto openclBackend = static_cast*>(&bdaBridge->getBackend()); - wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get()); - } -#endif + + bdaBridge->initWellContributions(wellContribs); + if (!useWellConn_) { simulator_.problem().wellModel().getWellContributions(wellContribs); } diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 991dc14b1..6dc037468 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -18,6 +18,7 @@ */ #include + #include #include #include @@ -32,14 +33,6 @@ namespace bda { -// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU -// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in: -// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. 
Patel, SIAM 2015, https://doi.org/10.1137/140968896 -// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU -// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU -#define CHOW_PATEL 0 -#define CHOW_PATEL_GPU 1 - using Opm::OpmLog; using Dune::Timer; @@ -126,41 +119,54 @@ namespace bda diagIndex = new int[mat->Nb]; invDiagVals = new double[mat->Nb * bs * bs]; +#if CHOW_PATEL Lmat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); Umat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); +#endif + + LUmat->nnzValues = new double[mat->nnzbs * bs * bs]; - s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); - s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs); - s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); - s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Umat->nnzbs); - s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); - s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Umat->Nb + 1)); s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb); s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1)); + s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb); +#if CHOW_PATEL + s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); + s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); + s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); + s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); + s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); + s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); +#else + s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs); + s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs); + s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1)); +#endif - queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues); - queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues); - queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices); - queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices); - queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers); - queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers); - queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals); + events.resize(2); + err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals, nullptr, &events[0]); - int *rowsPerColorPrefix = new int[numColors + 1]; - rowsPerColorPrefix[0] = 0; + rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0 for (int i = 0; i < numColors; ++i) { rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i]; } - queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix); - delete[] rowsPerColorPrefix; + err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), 
rowsPerColorPrefix.data(), nullptr, &events[1]); + + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error"); + } return true; } // end init() + // implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015 template void BILU0::chow_patel_decomposition() { +#if CHOW_PATEL const unsigned int bs = block_size; int num_sweeps = 6; @@ -196,6 +202,8 @@ namespace bda } #endif + Timer t_total, t_preprocessing; + // Ut is actually BSC format std::unique_ptr > Ut = std::make_unique >(Nb, (nnzbs + Nb) / 2); @@ -274,7 +282,14 @@ namespace bda // Ltmp is only needed for CPU decomposition, GPU creates GPU buffer for Ltmp double *Utmp = new double[Ut->nnzbs * block_size * block_size]; + if (verbosity >= 3) { + std::ostringstream out; + out << "BILU0 ChowPatel preprocessing: " << t_preprocessing.stop() << " s"; + OpmLog::info(out.str()); + } + // actual ILU decomposition + Timer t_decomposition; #if CHOW_PATEL_GPU chowPatelIlu.decomposition(queue, context, Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs, @@ -396,10 +411,18 @@ namespace bda delete[] Ltmp; #endif + if (verbosity >= 3){ + std::ostringstream out; + out << "BILU0 ChowPatel decomposition: " << t_decomposition.stop() << " s"; + OpmLog::info(out.str()); + } + + Timer t_postprocessing; + // convert Ut to BSR // diagonal stored separately std::vector ptr(Nb+1, 0); - std::vector col(Ut->rowPointers[Nb]); + std::vector cols(Ut->rowPointers[Nb]); // count blocks per row for U (BSR) // store diagonal in invDiagVals @@ -423,7 +446,7 @@ namespace bda int j = Ut->colIndices[k]; if (j != i) { int head = ptr[j]++; - col[head] = i; + cols[head] = i; memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs); } } @@ -433,21 +456,55 @@ namespace bda std::rotate(ptr.begin(), ptr.end() - 1, ptr.end()); ptr.front() = 0; - // reversing the rows of U, because that is the order they are used in during ILU apply - int URowIndex = 0; - int offsetU = 0; // number of nnz blocks that are already copied to Umat - Umat->rowPointers[0] = 0; - for (int i = LUmat->Nb - 1; i >= 0; i--) { - int rowSize = ptr[i + 1] - ptr[i]; // number of blocks in this row - memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize); - memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize); - offsetU += rowSize; - Umat->rowPointers[URowIndex + 1] = offsetU; - URowIndex++; + + if (verbosity >= 3){ + std::ostringstream out; + out << "BILU0 ChowPatel postprocessing: " << t_postprocessing.stop() << " s"; + OpmLog::info(out.str()); + } + + Timer t_copyToGpu; + + events.resize(3); + queue->enqueueWriteBuffer(s.Lvals, CL_FALSE, 0, Lmat->nnzbs * bs * bs * sizeof(double), Lmat->nnzValues, nullptr, &events[0]); + queue->enqueueWriteBuffer(s.Uvals, CL_FALSE, 0, Umat->nnzbs * bs * bs * sizeof(double), Utmp, nullptr, &events[1]); + queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, LUmat->Nb * bs * bs * sizeof(double), invDiagVals, nullptr, &events[2]); + + std::call_once(pattern_uploaded, [&](){ + // find the positions of each diagonal block + // must be done after reordering + for (int row = 0; row < Nb; ++row) { + int rowStart = LUmat->rowPointers[row]; + int rowEnd = LUmat->rowPointers[row+1]; + + auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row); + assert(candidate != 
LUmat->colIndices + rowEnd); + diagIndex[row] = candidate - LUmat->colIndices; + } + events.resize(8); + queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[3]); + queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]); + queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]); + queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]); + queue->enqueueWriteBuffer(s.Urows, CL_FALSE, 0, (Umat->Nb + 1) * sizeof(int), ptr.data(), nullptr, &events[7]); + }); + + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error"); + } + + if (verbosity >= 3){ + std::ostringstream out; + out << "BILU0 ChowPatel copy to GPU: " << t_copyToGpu.stop() << " s\n"; + out << "BILU0 ChowPatel total: " << t_total.stop() << " s"; + OpmLog::info(out.str()); } delete[] Utmp; - +#endif // CHOW_PATEL } template @@ -479,118 +536,68 @@ namespace bda OpmLog::info(out.str()); } - Timer t_decomposition; #if CHOW_PATEL chow_patel_decomposition(); #else - int i, j, ij, ik, jk; - int iRowStart, iRowEnd, jRowEnd; - double pivot[bs * bs]; - - int LSize = 0; - Opm::Detail::Inverter inverter; // reuse inverter to invert blocks - - // go through all rows - for (i = 0; i < LUmat->Nb; i++) { - iRowStart = LUmat->rowPointers[i]; - iRowEnd = LUmat->rowPointers[i + 1]; - - // go through all elements of the row - for (ij = iRowStart; ij < iRowEnd; ij++) { - j = LUmat->colIndices[ij]; - // if the element is the diagonal, store the index and go to next row - if (j == i) { - diagIndex[i] = ij; - break; - } - // if an element beyond the diagonal is reach, no diagonal was found - // throw an error now. TODO: perform reordering earlier to prevent this - if (j > i) { - std::ostringstream out; - out << "BILU0 Error could not find diagonal value in row: " << i; - OpmLog::error(out.str()); - return false; - } - - LSize++; - // calculate the pivot of this row - blockMult(LUmat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]); - - memcpy(LUmat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * block_size * block_size); - - jRowEnd = LUmat->rowPointers[j + 1]; - jk = diagIndex[j] + 1; - ik = ij + 1; - // substract that row scaled by the pivot from this row. - while (ik < iRowEnd && jk < jRowEnd) { - if (LUmat->colIndices[ik] == LUmat->colIndices[jk]) { - blockMultSub(LUmat->nnzValues + ik * bs * bs, pivot, LUmat->nnzValues + jk * bs * bs); - ik++; - jk++; - } else { - if (LUmat->colIndices[ik] < LUmat->colIndices[jk]) - { ik++; } - else - { jk++; } - } - } - } - // store the inverse in the diagonal! 
- inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs); - - memcpy(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs); - } - - Lmat->rowPointers[0] = 0; - Umat->rowPointers[0] = 0; - - // Split the LU matrix into two by comparing column indices to diagonal indices - for (i = 0; i < LUmat->Nb; i++) { - int offsetL = Lmat->rowPointers[i]; - int rowSize = diagIndex[i] - LUmat->rowPointers[i]; - int offsetLU = LUmat->rowPointers[i]; - memcpy(Lmat->nnzValues + offsetL * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize); - memcpy(Lmat->colIndices + offsetL, LUmat->colIndices + offsetLU, sizeof(int) * rowSize); - offsetL += rowSize; - Lmat->rowPointers[i + 1] = offsetL; - } - // Reverse the order or the (blocked) rows for the U matrix, - // because the rows are accessed in reverse order when applying the ILU0 - int URowIndex = 0; - for (i = LUmat->Nb - 1; i >= 0; i--) { - int offsetU = Umat->rowPointers[URowIndex]; - int rowSize = LUmat->rowPointers[i + 1] - diagIndex[i] - 1; - int offsetLU = diagIndex[i] + 1; - memcpy(Umat->nnzValues + offsetU * bs * bs, LUmat->nnzValues + offsetLU * bs * bs, sizeof(double) * bs * bs * rowSize); - memcpy(Umat->colIndices + offsetU, LUmat->colIndices + offsetLU, sizeof(int) * rowSize); - offsetU += rowSize; - Umat->rowPointers[URowIndex + 1] = offsetU; - URowIndex++; - } -#endif - if (verbosity >= 3) { - std::ostringstream out; - out << "BILU0 decomposition: " << t_decomposition.stop() << " s"; - OpmLog::info(out.str()); - } - Timer t_copyToGpu; - if (pattern_uploaded == false) { - queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices); - queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, Umat->nnzbs * sizeof(int), Umat->colIndices); - queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers); - queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (Umat->Nb + 1) * sizeof(int), Umat->rowPointers); - pattern_uploaded = true; + + events.resize(1); + queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]); + + std::call_once(pattern_uploaded, [&](){ + // find the positions of each diagonal block + // must be done after reordering + for (int row = 0; row < Nb; ++row) { + int rowStart = LUmat->rowPointers[row]; + int rowEnd = LUmat->rowPointers[row+1]; + + auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row); + assert(candidate != LUmat->colIndices + rowEnd); + diagIndex[row] = candidate - LUmat->colIndices; + } + events.resize(4); + queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[1]); + queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]); + queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]); + }); + + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error"); } - queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, Lmat->nnzbs * sizeof(double) * bs * bs, Lmat->nnzValues); - queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, Umat->nnzbs * sizeof(double) * bs * bs, Umat->nnzValues); - queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(double) * bs * bs, invDiagVals); + if 
(verbosity >= 3) { std::ostringstream out; out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s"; OpmLog::info(out.str()); } + Timer t_decomposition; + std::ostringstream out; + cl::Event event; + for (int color = 0; color < numColors; ++color) { + const unsigned int firstRow = rowsPerColorPrefix[color]; + const unsigned int lastRow = rowsPerColorPrefix[color+1]; + const unsigned int work_group_size2 = 128; + const unsigned int num_work_groups2 = 1024; + const unsigned int total_work_items2 = num_work_groups2 * work_group_size2; + const unsigned int num_hwarps_per_group = work_group_size2 / 16; + const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double); // each block needs a pivot + if (verbosity >= 4) { + out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; + } + event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2)); + event.wait(); + } + + if (verbosity >= 3) { + out << "BILU0 decomposition: " << t_decomposition.stop() << " s"; + OpmLog::info(out.str()); + } +#endif // CHOW_PATEL + return true; } // end create_preconditioner() @@ -604,16 +611,24 @@ namespace bda Timer t_apply; for(int color = 0; color < numColors; ++color){ - event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#if CHOW_PATEL + event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#else + event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#endif // event.wait(); } for(int color = numColors-1; color >= 0; --color){ - event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#if CHOW_PATEL + event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#else + event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); +#endif // event.wait(); } - if (verbosity >= 3) { + if (verbosity >= 4) { event.wait(); std::ostringstream out; out << "BILU0 apply: " << t_apply.stop() << " s"; @@ -638,11 +653,13 @@ namespace bda } template void BILU0::setKernels( - cl::make_kernel *ILU_apply1_, - cl::make_kernel *ILU_apply2_ + cl::make_kernel *ILU_apply1_, + cl::make_kernel *ILU_apply2_, + cl::make_kernel *ilu_decomp_k_ ){ this->ILU_apply1 = ILU_apply1_; this->ILU_apply2 = ILU_apply2_; + this->ilu_decomp_k = ilu_decomp_k_; } @@ -657,8 +674,9 @@ template void BILU0::setOpenCLContext(cl::Context*); template void 
BILU0::setOpenCLQueue(cl::CommandQueue*); \ template void BILU0::setKernelParameters(unsigned int, unsigned int, unsigned int); \ template void BILU0::setKernels( \ - cl::make_kernel *, \ - cl::make_kernel * \ + cl::make_kernel *, \ + cl::make_kernel *, \ + cl::make_kernel * \ ); INSTANTIATE_BDA_FUNCTIONS(1); diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp index 67d945851..aeca4817e 100644 --- a/opm/simulators/linalg/bda/BILU0.hpp +++ b/opm/simulators/linalg/bda/BILU0.hpp @@ -20,12 +20,27 @@ #ifndef BILU0_HPP #define BILU0_HPP +#include + #include #include #include +#include #include +// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU +// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in: +// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896 +// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU +// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU +// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U +// the exact decomposition uses a full matrix LU which is the superposition of L and U +// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split +#define CHOW_PATEL 0 +#define CHOW_PATEL_GPU 1 + + namespace bda { @@ -40,33 +55,48 @@ namespace bda int Nb; // number of blockrows of the matrix int nnz; // number of nonzeroes of the matrix (scalar) int nnzbs; // number of blocks of the matrix - std::unique_ptr > Lmat = nullptr, Umat = nullptr, LUmat = nullptr; + std::unique_ptr > LUmat = nullptr; std::shared_ptr > rmat = nullptr; // only used with PAR_SIM +#if CHOW_PATEL + std::unique_ptr > Lmat = nullptr, Umat = nullptr; +#endif double *invDiagVals = nullptr; int *diagIndex = nullptr; - std::vector rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel + std::vector rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel + std::vector rowsPerColorPrefix; // the prefix sum of rowsPerColor int *toOrder = nullptr, *fromOrder = nullptr; int numColors; int verbosity; + std::once_flag pattern_uploaded; ILUReorder opencl_ilu_reorder; typedef struct { - cl::Buffer Lvals, Uvals, invDiagVals; - cl::Buffer Lcols, Lrows; - cl::Buffer Ucols, Urows; + cl::Buffer invDiagVals; + cl::Buffer diagIndex; cl::Buffer rowsPerColor; +#if CHOW_PATEL + cl::Buffer Lvals, Lcols, Lrows; + cl::Buffer Uvals, Ucols, Urows; +#else + cl::Buffer LUvals, LUcols, LUrows; +#endif } GPU_storage; - cl::make_kernel *ILU_apply1; - cl::make_kernel *ILU_apply2; + ilu_apply1_kernel_type *ILU_apply1; + ilu_apply2_kernel_type *ILU_apply2; + cl::make_kernel *ilu_decomp_k; + GPU_storage s; cl::Context *context; cl::CommandQueue *queue; + std::vector events; + cl_int err; int work_group_size = 0; int total_work_items = 0; int lmem_per_work_group = 0; - bool pattern_uploaded = false; ChowPatelIlu chowPatelIlu; @@ -91,8 +121,9 @@ namespace bda void setOpenCLQueue(cl::CommandQueue *queue); void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group); void setKernels( - cl::make_kernel *ILU_apply1, - cl::make_kernel *ILU_apply2 + cl::make_kernel *ILU_apply1, + cl::make_kernel *ILU_apply2, + cl::make_kernel *ilu_decomp_k ); int* getToOrder() diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp 
b/opm/simulators/linalg/bda/BdaBridge.cpp index 39d403cb7..9f1bbca16 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -28,6 +28,19 @@ #include #include +#if HAVE_CUDA +#include +#endif + +#if HAVE_OPENCL +#include +#endif + +#if HAVE_FPGA +#include +#endif + + #define PRINT_TIMERS_BRIDGE 0 typedef Dune::InverseOperatorResult InverseOperatorResult; @@ -41,7 +54,8 @@ namespace Opm using bda::ILUReorder; template -BdaBridge::BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED) +BdaBridge::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED) +: gpu_mode(gpu_mode_) { if (gpu_mode.compare("cusparse") == 0) { #if HAVE_CUDA @@ -224,6 +238,19 @@ void BdaBridge::get_result(BridgeVector } } +template +void BdaBridge::initWellContributions(WellContributions& wellContribs) { + if(gpu_mode.compare("opencl") == 0){ +#if HAVE_OPENCL + const auto openclBackend = static_cast*>(backend.get()); + wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get()); +#else + OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake"); +#endif + } +} + + #define INSTANTIATE_BDA_FUNCTIONS(n) \ template BdaBridge, std::allocator > >, \ Dune::BlockVector, std::allocator > >, \ @@ -241,6 +268,11 @@ template void BdaBridge, std::al Dune::BlockVector, std::allocator > >, \ n>::get_result \ (Dune::BlockVector, std::allocator > >&); \ + \ +template void BdaBridge, std::allocator > >, \ +Dune::BlockVector, std::allocator > >, \ +n>::initWellContributions(WellContributions&) + INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(2); diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index 962acc9b4..a8d3bcc1a 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -25,16 +25,10 @@ #include "dune/istl/bcrsmatrix.hh" #include +#include #include #include -#if HAVE_CUDA -#include -#endif - -#if HAVE_OPENCL -#include -#endif namespace Opm { @@ -48,6 +42,7 @@ class BdaBridge { private: bool use_gpu = false; + std::string gpu_mode; std::unique_ptr > backend; public: @@ -79,9 +74,11 @@ public: return use_gpu; } - const bda::BdaSolver& getBackend() const { - return *backend; - } + /// Initialize the WellContributions object with opencl context and queue + /// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL + /// \param[in] wellContribs container to hold all WellContributions + void initWellContributions(WellContributions& wellContribs); + }; // end class BdaBridge diff --git a/opm/simulators/linalg/bda/ChowPatelIlu.cpp b/opm/simulators/linalg/bda/ChowPatelIlu.cpp index 894b3f05c..b1ac6448b 100644 --- a/opm/simulators/linalg/bda/ChowPatelIlu.cpp +++ b/opm/simulators/linalg/bda/ChowPatelIlu.cpp @@ -529,7 +529,7 @@ void ChowPatelIlu::decomposition( err |= queue->enqueueWriteBuffer(d_LU_cols, CL_FALSE, 0, sizeof(int) * LU_nnzbs, LU_cols, nullptr, &events[5]); cl::WaitForEvents(events); events.clear(); - if (verbosity >= 3){ + if (verbosity >= 4){ std::ostringstream out; out << "ChowPatelIlu copy sparsity pattern time: " << t_copy_pattern.stop() << " s"; OpmLog::info(out.str()); @@ -550,7 +550,7 @@ void ChowPatelIlu::decomposition( 
err |= queue->enqueueWriteBuffer(d_LU_vals, CL_FALSE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals, nullptr, &events[2]); cl::WaitForEvents(events); events.clear(); - if (verbosity >= 3){ + if (verbosity >= 4){ std::ostringstream out; out << "ChowPatelIlu copy1 time: " << t_copy1.stop() << " s"; OpmLog::info(out.str()); @@ -585,7 +585,7 @@ void ChowPatelIlu::decomposition( d_Ut_idxs, d_L_cols, d_LU_cols, *Larg2, *Uarg2, Nb, cl::Local(lmem_per_work_group), cl::Local(lmem_per_work_group)); event.wait(); - if (verbosity >= 3){ + if (verbosity >= 4){ std::ostringstream out; out << "ChowPatelIlu sweep kernel time: " << t_kernel.stop() << " s"; OpmLog::info(out.str()); @@ -604,7 +604,7 @@ void ChowPatelIlu::decomposition( } cl::WaitForEvents(events); events.clear(); - if (verbosity >= 3){ + if (verbosity >= 4){ std::ostringstream out; out << "ChowPatelIlu copy2 time: " << t_copy2.stop() << " s"; OpmLog::info(out.str()); diff --git a/opm/simulators/linalg/bda/Reorder.cpp b/opm/simulators/linalg/bda/Reorder.cpp index b7eebee57..cae3b35f9 100644 --- a/opm/simulators/linalg/bda/Reorder.cpp +++ b/opm/simulators/linalg/bda/Reorder.cpp @@ -201,11 +201,10 @@ void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, void colorsToReordering(int Nb, std::vector& colors, int numColors, int *toOrder, int *fromOrder, std::vector& rowsPerColor) { int reordered = 0; - int i, c; // Find reordering patterns - for (c = 0; c < numColors; c++) { - for (i = 0; i < Nb; i++) { + for (int c = 0; c < numColors; c++) { + for (int i = 0; i < Nb; i++) { if (colors[i] == c) { rowsPerColor[c]++; toOrder[i] = reordered; @@ -259,10 +258,11 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd int activeRowIndex = 0, colorEnd, nextActiveRowIndex = 0; int thisRow; std::vector doneRows(Nb, false); - rowsPerColor.reserve(Nb); - std::vector rowsToStart; + // since emplace_back() is used to fill, the vector must be empty + assert(rowsPerColor.size() == 0); + // find starting rows: rows that are independent from all rows that come before them. 
for (thisRow = 0; thisRow < Nb; thisRow++) { if (canBeStarted(thisRow, CSCColPointers, CSCRowIndices, doneRows)) { @@ -312,8 +312,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd template void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector& rowsPerColor) { - std::vector rowColor; - rowColor.resize(Nb); + std::vector rowColor(Nb); *numColors = colorBlockedNodes(Nb, CSRRowPointers, CSRColIndices, CSCColPointers, CSCRowIndices, rowColor, maxRowsPerColor, maxColsPerColor); diff --git a/opm/simulators/linalg/bda/Reorder.hpp b/opm/simulators/linalg/bda/Reorder.hpp index 7ae2c90db..08c8716ff 100644 --- a/opm/simulators/linalg/bda/Reorder.hpp +++ b/opm/simulators/linalg/bda/Reorder.hpp @@ -90,7 +90,7 @@ bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIn /// \param[out] numColors a pointer to the number of colors needed for the level scheduling /// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved /// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved -/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color +/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector& rowsPerColor); /// Find a graph coloring reordering for an input matrix @@ -105,7 +105,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd /// \param[out] numColors the number of colors used in the found graph coloring /// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved /// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved -/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color +/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function will resize() template void findGraphColoring(const int *CSRColIndices, const int *CSRRowPointers, const int *CSCRowIndices, const int *CSCColPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int *fromOrder, std::vector& rowsPerColor); diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index f17c53c3b..7d9c4dc0b 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -22,7 +22,7 @@ #include #include #include -#include + #include namespace Opm @@ -74,7 +74,7 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu this->queue = queue_; } -void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){ +void WellContributions::setKernel(stdwell_apply_kernel_type *kernel_, stdwell_apply_no_reorder_kernel_type 
*kernel_no_reorder_){ this->kernel = kernel_; this->kernel_no_reorder = kernel_no_reorder_; } diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp index f3f8e3fd2..00addf546 100644 --- a/opm/simulators/linalg/bda/WellContributions.hpp +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -26,6 +26,7 @@ #if HAVE_OPENCL #include +#include #endif #include @@ -39,6 +40,9 @@ namespace Opm { +using bda::stdwell_apply_kernel_type; +using bda::stdwell_apply_no_reorder_kernel_type; + /// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver /// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used /// So far, StandardWell and MultisegmentWell are supported @@ -92,17 +96,10 @@ private: std::vector multisegments; #if HAVE_OPENCL - typedef cl::make_kernel kernel_type; - typedef cl::make_kernel kernel_type_no_reorder; - cl::Context *context; cl::CommandQueue *queue; - kernel_type *kernel; - kernel_type_no_reorder *kernel_no_reorder; + stdwell_apply_kernel_type *kernel; + stdwell_apply_no_reorder_kernel_type *kernel_no_reorder; std::vector events; std::unique_ptr d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl; @@ -154,7 +151,7 @@ public: #endif #if HAVE_OPENCL - void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_); + void setKernel(stdwell_apply_kernel_type *kernel_, stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_); void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_); /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect diff --git a/opm/simulators/linalg/bda/openclKernels.cpp b/opm/simulators/linalg/bda/openclKernels.cpp new file mode 100644 index 000000000..c82339035 --- /dev/null +++ b/opm/simulators/linalg/bda/openclKernels.cpp @@ -0,0 +1,621 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include + +namespace bda +{ + + std::string get_axpy_string() { + return R"( + __kernel void axpy( + __global double *in, + const double a, + __global double *out, + const int N) + { + unsigned int NUM_THREADS = get_global_size(0); + int idx = get_global_id(0); + + while(idx < N){ + out[idx] += a * in[idx]; + idx += NUM_THREADS; + } + } + )"; + } + + + // returns partial sums, instead of the final dot product + std::string get_dot_1_string() { + return R"( + __kernel void dot_1( + __global double *in1, + __global double *in2, + __global double *out, + const unsigned int N, + __local double *tmp) + { + unsigned int tid = get_local_id(0); + unsigned int bsize = get_local_size(0); + unsigned int bid = get_global_id(0) / bsize; + unsigned int i = get_global_id(0); + unsigned int NUM_THREADS = get_global_size(0); + + double sum = 0.0; + while(i < N){ + sum += in1[i] * in2[i]; + i += NUM_THREADS; + } + tmp[tid] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) + { + if (tid < s) + { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // write result for this block to global mem + if (tid == 0) out[get_group_id(0)] = tmp[0]; + } + )"; + } + + + // returns partial sums, instead of the final norm + // the square root must be computed on CPU + std::string get_norm_string() { + return R"( + __kernel void norm( + __global double *in, + __global double *out, + const unsigned int N, + __local double *tmp) + { + unsigned int tid = get_local_id(0); + unsigned int bsize = get_local_size(0); + unsigned int bid = get_global_id(0) / bsize; + unsigned int i = get_global_id(0); + unsigned int NUM_THREADS = get_global_size(0); + + double local_sum = 0.0; + while(i < N){ + local_sum += in[i] * in[i]; + i += NUM_THREADS; + } + tmp[tid] = local_sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) + { + if (tid < s) + { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // write result for this block to global mem + if (tid == 0) out[get_group_id(0)] = tmp[0]; + } + )"; + } + + + // p = (p - omega * v) * beta + r + std::string get_custom_string() { + return R"( + __kernel void custom( + __global double *p, + __global double *v, + __global double *r, + const double omega, + const double beta, + const int N) + { + const unsigned int NUM_THREADS = get_global_size(0); + unsigned int idx = get_global_id(0); + + while(idx < N){ + double res = p[idx]; + res -= omega * v[idx]; + res *= beta; + res += r[idx]; + p[idx] = res; + idx += NUM_THREADS; + } + } + )"; + } + + std::string get_spmv_blocked_string() { + return R"( + __kernel void spmv_blocked( + __global const double *vals, + __global const int *cols, + __global const int *rows, + const int Nb, + __global const double *x, + __global double *b, + const unsigned int block_size, + __local double *tmp) + { + const unsigned int warpsize = 32; + const unsigned int bsize = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int idx_t = get_local_id(0); + unsigned int idx = idx_b * bsize + idx_t; + const unsigned int bs = block_size; + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int target_block_row = idx / warpsize; + 
const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + // for 3x3 blocks: + // num_active_threads: 27 + // num_blocks_per_warp: 3 + + while(target_block_row < Nb){ + unsigned int first_block = rows[target_block_row]; + unsigned int last_block = rows[target_block_row+1]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + + if(lane < num_active_threads){ + for(; block < last_block; block += num_blocks_per_warp){ + double x_elem = x[cols[block]*bs + c]; + double A_elem = vals[block*bs*bs + c + r*bs]; + local_out += x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lane < bs){ + unsigned int row = target_block_row*bs + lane; + b[row] = tmp[lane]; + } + target_block_row += num_warps_in_grid; + } + } + )"; + } + + + + std::string get_ILU_apply1_string(bool full_matrix) { + std::string s = R"( + __kernel void ILU_apply1( + __global const double *LUvals, + __global const unsigned int *LUcols, + __global const unsigned int *LUrows, + __global const int *diagIndex, + __global const double *y, + __global double *x, + __global const unsigned int *nodesPerColorPrefix, + const unsigned int color, + const unsigned int block_size, + __local double *tmp) + { + const unsigned int warpsize = 32; + const unsigned int bs = block_size; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int idx = get_global_id(0); + unsigned int target_block_row = idx / warpsize; + target_block_row += nodesPerColorPrefix[color]; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + while(target_block_row < nodesPerColorPrefix[color+1]){ + const unsigned int first_block = LUrows[target_block_row]; + )"; + if (full_matrix) { + s += "const unsigned int last_block = diagIndex[target_block_row]; "; + } else { + s += "const unsigned int last_block = LUrows[target_block_row+1]; "; + } + s += R"( + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + if(lane < num_active_threads){ + if(lane < bs){ + local_out = y[target_block_row*bs+lane]; + } + for(; block < last_block; block += num_blocks_per_warp){ + const double x_elem = x[LUcols[block]*bs + c]; + const double A_elem = LUvals[block*bs*bs + c + r*bs]; + local_out -= x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lane < bs){ + const unsigned int row = target_block_row*bs + lane; + x[row] = tmp[lane]; + } + + target_block_row += num_warps_in_grid; + } + } + )"; + return s; + } + + + std::string get_ILU_apply2_string(bool full_matrix) { + std::string s = R"( + __kernel void ILU_apply2( + __global const double *LUvals, + __global const int *LUcols, + __global const int *LUrows, + __global const int *diagIndex, + __global const double *invDiagVals, + 
__global double *x, + __global const unsigned int *nodesPerColorPrefix, + const unsigned int color, + const unsigned int block_size, + __local double *tmp) + { + const unsigned int warpsize = 32; + const unsigned int bs = block_size; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int idx = get_global_id(0); + unsigned int target_block_row = idx / warpsize; + target_block_row += nodesPerColorPrefix[color]; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + const double relaxation = 0.9; + + while(target_block_row < nodesPerColorPrefix[color+1]){ + )"; + if (full_matrix) { + s += "const unsigned int first_block = diagIndex[target_block_row] + 1; "; + } else { + s += "const unsigned int first_block = LUrows[target_block_row]; "; + } + s += R"( + const unsigned int last_block = LUrows[target_block_row+1]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + if(lane < num_active_threads){ + if(lane < bs){ + const unsigned int row = target_block_row*bs+lane; + local_out = x[row]; + } + for(; block < last_block; block += num_blocks_per_warp){ + const double x_elem = x[LUcols[block]*bs + c]; + const double A_elem = LUvals[block*bs*bs + c + r*bs]; + local_out -= x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + local_out = tmp[lane]; + + if(lane < bs){ + tmp[lane + bs*idx_t/warpsize] = local_out; + double sum = 0.0; + for(int i = 0; i < bs; ++i){ + sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; + } + + const unsigned int row = target_block_row*bs + lane; + x[row] = relaxation * sum; + } + + target_block_row += num_warps_in_grid; + } + } + )"; + return s; + } + + std::string get_stdwell_apply_string(bool reorder) { + std::string kernel_name = reorder ? 
"stdwell_apply" : "stdwell_apply_no_reorder"; + std::string s = "__kernel void " + kernel_name + R"(( + __global const double *Cnnzs, + __global const double *Dnnzs, + __global const double *Bnnzs, + __global const int *Ccols, + __global const int *Bcols, + __global const double *x, + __global double *y, + )"; + if (reorder) { + s += R"(__global const int *toOrder, + )"; + } + s += R"(const unsigned int dim, + const unsigned int dim_wells, + __global const unsigned int *val_pointers, + __local double *localSum, + __local double *z1, + __local double *z2){ + int wgId = get_group_id(0); + int wiId = get_local_id(0); + int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; + int valsPerBlock = dim*dim_wells; + int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; + int numBlocksPerWarp = get_local_size(0)/valsPerBlock; + int c = wiId % dim; + int r = (wiId/dim) % dim_wells; + double temp; + + barrier(CLK_LOCAL_MEM_FENCE); + + localSum[wiId] = 0; + if(wiId < numActiveWorkItems){ + int b = wiId/valsPerBlock + val_pointers[wgId]; + while(b < valSize + val_pointers[wgId]){ + )"; + if (reorder) { + s += "int colIdx = toOrder[Bcols[b]]; "; + } else { + s += "int colIdx = Bcols[b]; "; + } + s += R"( + localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; + b += numBlocksPerWarp; + } + + if(wiId < valsPerBlock){ + localSum[wiId] += localSum[wiId + valsPerBlock]; + } + + b = wiId/valsPerBlock + val_pointers[wgId]; + + if(c == 0 && wiId < valsPerBlock){ + for(unsigned int stride = 2; stride > 0; stride >>= 1){ + localSum[wiId] += localSum[wiId + stride]; + } + z1[r] = localSum[wiId]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim_wells){ + temp = 0.0; + for(unsigned int i = 0; i < dim_wells; ++i){ + temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; + } + z2[wiId] = temp; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim*valSize){ + temp = 0.0; + int bb = wiId/dim + val_pointers[wgId]; + for (unsigned int j = 0; j < dim_wells; ++j){ + temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; + } + )"; + if (reorder) { + s += "int colIdx = toOrder[Ccols[bb]]; "; + } else { + s += "int colIdx = Ccols[bb]; "; + } + s += R"( + y[colIdx*dim + c] -= temp; + } + } + )"; + return s; + } + + + std::string get_ilu_decomp_string() { + return R"( + + // a = a - (b * c) + __kernel void block_mult_sub(__global double *a, __local double *b, __global double *c) + { + const unsigned int block_size = 3; + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < block_size * block_size){ + const unsigned int row = thread_id_in_hwarp / block_size; + const unsigned int col = thread_id_in_hwarp % block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += b[block_size * row + k] * c[block_size * k + col]; + } + a[block_size * row + col] -= temp; + } + } + + // c = a * b + __kernel void block_mult(__global double *a, __global double *b, __local double *c) { + const unsigned int block_size = 3; + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < block_size * block_size){ + const unsigned int row = thread_id_in_hwarp / block_size; + const unsigned int col = thread_id_in_hwarp % 
block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += a[block_size * row + k] * b[block_size * k + col]; + } + c[block_size * row + col] = temp; + } + } + + // invert 3x3 matrix + __kernel void inverter(__global double *matrix, __global double *inverse) { + const unsigned int block_size = 3; + const unsigned int bs = block_size; // rename to shorter name + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < bs * bs){ + double t4 = matrix[0] * matrix[4]; + double t6 = matrix[0] * matrix[5]; + double t8 = matrix[1] * matrix[3]; + double t10 = matrix[2] * matrix[3]; + double t12 = matrix[1] * matrix[6]; + double t14 = matrix[2] * matrix[6]; + + double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] + + t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]); + double t17 = 1.0 / det; + + const unsigned int r = thread_id_in_hwarp / bs; + const unsigned int c = thread_id_in_hwarp % bs; + const unsigned int r1 = (r+1) % bs; + const unsigned int c1 = (c+1) % bs; + const unsigned int r2 = (r+bs-1) % bs; + const unsigned int c2 = (c+bs-1) % bs; + inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17; + } + } + + __kernel void ilu_decomp(const unsigned int firstRow, + const unsigned int lastRow, + __global double *LUvals, + __global const int *LUcols, + __global const int *LUrows, + __global double *invDiagVals, + __global int *diagIndex, + const unsigned int Nb, + __local double *pivot){ + + const unsigned int bs = 3; + const unsigned int hwarp_size = 16; + const unsigned int work_group_size = get_local_size(0); + const unsigned int work_group_id = get_group_id(0); + const unsigned int num_groups = get_num_groups(0); + const unsigned int hwarps_per_group = work_group_size / hwarp_size; + const unsigned int thread_id_in_group = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size; // thread id in hwarp (16 threads) + const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size; + const unsigned int lmem_offset = hwarp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it + // every workitem in a hwarp has the same lmem_offset + + // go through all rows + for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group) + { + int iRowStart = LUrows[i]; + int iRowEnd = LUrows[i + 1]; + + // go through all elements of the row + for (int ij = iRowStart; ij < iRowEnd; ij++) { + int j = LUcols[ij]; + + if (j < i) { + // calculate the pivot of this row + block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset); + + // copy pivot + if (thread_id_in_hwarp < bs * bs) { + LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp]; + } + + int jRowEnd = LUrows[j + 1]; + int jk = diagIndex[j] + 1; + int ik = ij + 1; + // subtract that row scaled by the pivot from this row. 
+ while (ik < iRowEnd && jk < jRowEnd) { + if (LUcols[ik] == LUcols[jk]) { + block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs); + ik++; + jk++; + } else { + if (LUcols[ik] < LUcols[jk]) + { ik++; } + else + { jk++; } + } + } + } + } + + // store the inverse in the diagonal + inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs); + + // copy inverse + if (thread_id_in_hwarp < bs * bs) { + LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp]; + } + + } + } + )"; + } + +} // end namespace bda + diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index e3ffadf60..a408cf9ec 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -20,498 +20,76 @@ #ifndef OPENCL_HPP #define OPENCL_HPP +#include + +#include + namespace bda { - inline const char* axpy_s = R"( - __kernel void axpy( - __global double *in, - const double a, - __global double *out, - const int N) - { - unsigned int NUM_THREADS = get_global_size(0); - int idx = get_global_id(0); +using spmv_kernel_type = cl::make_kernel; +using ilu_apply1_kernel_type = cl::make_kernel; +using ilu_apply2_kernel_type = cl::make_kernel; +using stdwell_apply_kernel_type = cl::make_kernel; +using stdwell_apply_no_reorder_kernel_type = cl::make_kernel; +using ilu_decomp_kernel_type = cl::make_kernel; - while(idx < N){ - out[idx] += a * in[idx]; - idx += NUM_THREADS; - } - } - )"; + /// Generate string with axpy kernel + /// a = a + alpha * b + std::string get_axpy_string(); + /// returns partial sums, instead of the final dot product + /// partial sums are added on CPU + std::string get_dot_1_string(); - // returns partial sums, instead of the final dot product - inline const char* dot_1_s = R"( - __kernel void dot_1( - __global double *in1, - __global double *in2, - __global double *out, - const unsigned int N, - __local double *tmp) - { - unsigned int tid = get_local_id(0); - unsigned int bsize = get_local_size(0); - unsigned int bid = get_global_id(0) / bsize; - unsigned int i = get_global_id(0); - unsigned int NUM_THREADS = get_global_size(0); + /// returns partial sums, instead of the final norm + /// the square root must be computed on CPU + std::string get_norm_string(); - double sum = 0.0; - while(i < N){ - sum += in1[i] * in2[i]; - i += NUM_THREADS; - } - tmp[tid] = sum; + /// Generate string with custom kernel + /// This kernel combines some ilubicgstab vector operations into 1 + /// p = (p - omega * v) * beta + r + std::string get_custom_string(); - barrier(CLK_LOCAL_MEM_FENCE); + /// b = mat * x + /// algorithm based on: + /// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures, + /// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 + std::string get_spmv_blocked_string(); - // do reduction in shared mem - for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) - { - if (tid < s) - { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } + /// ILU apply part 1: forward substitution + /// solves L*x=y where L is a lower triangular sparse blocked matrix + /// this L can be its own BSR matrix (if full_matrix is false), + /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end + /// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not + std::string get_ILU_apply1_string(bool 
full_matrix); - // write result for this block to global mem - if (tid == 0) out[get_group_id(0)] = tmp[0]; - } - )"; + /// ILU apply part 2: backward substitution + /// solves U*x=y where U is an upper triangular sparse blocked matrix + /// this U can be its own BSR matrix (if full_matrix is false), + /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start + /// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not + std::string get_ILU_apply2_string(bool full_matrix); + /// Generate string with the stdwell_apply kernels + /// If reorder is true, the B/Ccols do not correspond with the x/y vector + /// the x/y vector is reordered, use toOrder to address that + /// \param[in] reorder whether the matrix is reordered or not + std::string get_stdwell_apply_string(bool reorder); - // returns partial sums, instead of the final norm - // the square root must be computed on CPU - inline const char* norm_s = R"( - __kernel void norm( - __global double *in, - __global double *out, - const unsigned int N, - __local double *tmp) - { - unsigned int tid = get_local_id(0); - unsigned int bsize = get_local_size(0); - unsigned int bid = get_global_id(0) / bsize; - unsigned int i = get_global_id(0); - unsigned int NUM_THREADS = get_global_size(0); - - double local_sum = 0.0; - while(i < N){ - local_sum += in[i] * in[i]; - i += NUM_THREADS; - } - tmp[tid] = local_sum; - - barrier(CLK_LOCAL_MEM_FENCE); - - // do reduction in shared mem - for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) - { - if (tid < s) - { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // write result for this block to global mem - if (tid == 0) out[get_group_id(0)] = tmp[0]; - } - )"; - - - // p = (p - omega * v) * beta + r - inline const char* custom_s = R"( - __kernel void custom( - __global double *p, - __global double *v, - __global double *r, - const double omega, - const double beta, - const int N) - { - const unsigned int NUM_THREADS = get_global_size(0); - unsigned int idx = get_global_id(0); - - while(idx < N){ - double res = p[idx]; - res -= omega * v[idx]; - res *= beta; - res += r[idx]; - p[idx] = res; - idx += NUM_THREADS; - } - } - )"; - - - // b = mat * x - // algorithm based on: - // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, - // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 - inline const char* spmv_blocked_s = R"( - __kernel void spmv_blocked( - __global const double *vals, - __global const int *cols, - __global const int *rows, - const int Nb, - __global const double *x, - __global double *b, - const unsigned int block_size, - __local double *tmp) - { - const unsigned int warpsize = 32; - const unsigned int bsize = get_local_size(0); - const unsigned int idx_b = get_global_id(0) / bsize; - const unsigned int idx_t = get_local_id(0); - unsigned int idx = idx_b * bsize + idx_t; - const unsigned int bs = block_size; - const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; - const unsigned int num_blocks_per_warp = warpsize/bs/bs; - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; - unsigned int target_block_row = idx / warpsize; - const unsigned int lane = idx_t % warpsize; - const unsigned int c = (lane / bs) % bs; - const unsigned int r = lane % bs; - - // for 3x3 blocks: - // num_active_threads: 27 - // num_blocks_per_warp: 3 - - 
while(target_block_row < Nb){ - unsigned int first_block = rows[target_block_row]; - unsigned int last_block = rows[target_block_row+1]; - unsigned int block = first_block + lane / (bs*bs); - double local_out = 0.0; - - if(lane < num_active_threads){ - for(; block < last_block; block += num_blocks_per_warp){ - double x_elem = x[cols[block]*bs + c]; - double A_elem = vals[block*bs*bs + c + r*bs]; - local_out += x_elem * A_elem; - } - } - - // do reduction in shared mem - tmp[lane] = local_out; - barrier(CLK_LOCAL_MEM_FENCE); - - for(unsigned int offset = 3; offset <= 24; offset <<= 1) - { - if (lane + offset < warpsize) - { - tmp[lane] += tmp[lane + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if(lane < bs){ - unsigned int row = target_block_row*bs + lane; - b[row] = tmp[lane]; - } - target_block_row += num_warps_in_grid; - } - } - )"; - - - - // ILU apply part 1: forward substitution - // solves L*x=y where L is a lower triangular sparse blocked matrix - inline const char* ILU_apply1_s = R"( - __kernel void ILU_apply1( - __global const double *Lvals, - __global const unsigned int *Lcols, - __global const unsigned int *Lrows, - const unsigned int LrowSize, - __global const double *y, - __global double *x, - __global const unsigned int *nodesPerColorPrefix, - const unsigned int color, - const unsigned int block_size, - __local double *tmp) - { - const unsigned int warpsize = 32; - const unsigned int bs = block_size; - const unsigned int idx_t = get_local_id(0); - const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; - const unsigned int num_blocks_per_warp = warpsize/bs/bs; - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; - unsigned int idx = get_global_id(0); - unsigned int target_block_row = idx / warpsize; - idx += nodesPerColorPrefix[color]; - target_block_row += nodesPerColorPrefix[color]; - const unsigned int lane = idx_t % warpsize; - const unsigned int c = (lane / bs) % bs; - const unsigned int r = lane % bs; - - while(target_block_row < nodesPerColorPrefix[color+1]){ - const unsigned int first_block = Lrows[target_block_row]; - const unsigned int last_block = Lrows[target_block_row+1]; - unsigned int block = first_block + lane / (bs*bs); - double local_out = 0.0; - if(lane < num_active_threads){ - if(lane < bs){ - local_out = y[target_block_row*bs+lane]; - } - for(; block < last_block; block += num_blocks_per_warp){ - const double x_elem = x[Lcols[block]*bs + c]; - const double A_elem = Lvals[block*bs*bs + c + r*bs]; - local_out -= x_elem * A_elem; - } - } - - // do reduction in shared mem - tmp[lane] = local_out; - barrier(CLK_LOCAL_MEM_FENCE); - - for(unsigned int offset = 3; offset <= 24; offset <<= 1) - { - if (lane + offset < warpsize) - { - tmp[lane] += tmp[lane + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if(lane < bs){ - const unsigned int row = target_block_row*bs + lane; - x[row] = tmp[lane]; - } - - target_block_row += num_warps_in_grid; - } - } - )"; - - - // ILU apply part 2: backward substitution - // solves U*x=y where L is a lower triangular sparse blocked matrix - inline const char* ILU_apply2_s = R"( - __kernel void ILU_apply2( - __global const double *Uvals, - __global const int *Ucols, - __global const int *Urows, - const unsigned int UrowSize, - __global const double *invDiagVals, - __global double *x, - __global const unsigned int *nodesPerColorPrefix, - const unsigned int color, - const unsigned int block_size, - __local double *tmp) - { - const unsigned int 
warpsize = 32; - const unsigned int bs = block_size; - const unsigned int idx_t = get_local_id(0); - const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; - const unsigned int num_blocks_per_warp = warpsize/bs/bs; - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; - unsigned int idx = get_global_id(0); - unsigned int target_block_row = idx / warpsize; - idx += nodesPerColorPrefix[color]; - target_block_row += nodesPerColorPrefix[color]; - const unsigned int lane = idx_t % warpsize; - const unsigned int c = (lane / bs) % bs; - const unsigned int r = lane % bs; - const double relaxation = 0.9; - - while(target_block_row < nodesPerColorPrefix[color+1]){ - const unsigned int first_block = Urows[UrowSize-target_block_row-1]; - const unsigned int last_block = Urows[UrowSize-target_block_row]; - unsigned int block = first_block + lane / (bs*bs); - double local_out = 0.0; - if(lane < num_active_threads){ - if(lane < bs){ - const unsigned int row = target_block_row*bs+lane; - local_out = x[row]; - } - for(; block < last_block; block += num_blocks_per_warp){ - const double x_elem = x[Ucols[block]*bs + c]; - const double A_elem = Uvals[block*bs*bs + c + r*bs]; - local_out -= x_elem * A_elem; - } - } - - // do reduction in shared mem - tmp[lane] = local_out; - barrier(CLK_LOCAL_MEM_FENCE); - - for(unsigned int offset = 3; offset <= 24; offset <<= 1) - { - if (lane + offset < warpsize) - { - tmp[lane] += tmp[lane + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - local_out = tmp[lane]; - - if(lane < bs){ - tmp[lane + bs*idx_t/warpsize] = local_out; - double sum = 0.0; - for(int i = 0; i < bs; ++i){ - sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; - } - - const unsigned int row = target_block_row*bs + lane; - x[row] = relaxation * sum; - } - - target_block_row += num_warps_in_grid; - } - } - )"; - - inline const char* stdwell_apply_s = R"( - __kernel void stdwell_apply(__global const double *Cnnzs, - __global const double *Dnnzs, - __global const double *Bnnzs, - __global const int *Ccols, - __global const int *Bcols, - __global const double *x, - __global double *y, - __global const int *toOrder, - const unsigned int dim, - const unsigned int dim_wells, - __global const unsigned int *val_pointers, - __local double *localSum, - __local double *z1, - __local double *z2){ - int wgId = get_group_id(0); - int wiId = get_local_id(0); - int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; - int valsPerBlock = dim*dim_wells; - int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; - int numBlocksPerWarp = get_local_size(0)/valsPerBlock; - int c = wiId % dim; - int r = (wiId/dim) % dim_wells; - double temp; - - barrier(CLK_LOCAL_MEM_FENCE); - - localSum[wiId] = 0; - if(wiId < numActiveWorkItems){ - int b = wiId/valsPerBlock + val_pointers[wgId]; - while(b < valSize + val_pointers[wgId]){ - int colIdx = toOrder[Bcols[b]]; - localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; - b += numBlocksPerWarp; - } - - if(wiId < valsPerBlock){ - localSum[wiId] += localSum[wiId + valsPerBlock]; - } - - b = wiId/valsPerBlock + val_pointers[wgId]; - - if(c == 0 && wiId < valsPerBlock){ - for(unsigned int stride = 2; stride > 0; stride >>= 1){ - localSum[wiId] += localSum[wiId + stride]; - } - z1[r] = localSum[wiId]; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(wiId < dim_wells){ - temp = 0.0; - for(unsigned int i = 0; i < dim_wells; ++i){ - temp += 
Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; - } - z2[wiId] = temp; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(wiId < dim*valSize){ - temp = 0.0; - int bb = wiId/dim + val_pointers[wgId]; - for (unsigned int j = 0; j < dim_wells; ++j){ - temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; - } - int colIdx = toOrder[Ccols[bb]]; - y[colIdx*dim + c] -= temp; - } - } - )"; - - inline const char* stdwell_apply_no_reorder_s = R"( - __kernel void stdwell_apply_no_reorder(__global const double *Cnnzs, - __global const double *Dnnzs, - __global const double *Bnnzs, - __global const int *Ccols, - __global const int *Bcols, - __global const double *x, - __global double *y, - const unsigned int dim, - const unsigned int dim_wells, - __global const unsigned int *val_pointers, - __local double *localSum, - __local double *z1, - __local double *z2){ - int wgId = get_group_id(0); - int wiId = get_local_id(0); - int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; - int valsPerBlock = dim*dim_wells; - int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; - int numBlocksPerWarp = get_local_size(0)/valsPerBlock; - int c = wiId % dim; - int r = (wiId/dim) % dim_wells; - double temp; - - barrier(CLK_LOCAL_MEM_FENCE); - - localSum[wiId] = 0; - if(wiId < numActiveWorkItems){ - int b = wiId/valsPerBlock + val_pointers[wgId]; - while(b < valSize + val_pointers[wgId]){ - int colIdx = Bcols[b]; - localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; - b += numBlocksPerWarp; - } - - if(wiId < valsPerBlock){ - localSum[wiId] += localSum[wiId + valsPerBlock]; - } - - b = wiId/valsPerBlock + val_pointers[wgId]; - - if(c == 0 && wiId < valsPerBlock){ - for(unsigned int stride = 2; stride > 0; stride >>= 1){ - localSum[wiId] += localSum[wiId + stride]; - } - z1[r] = localSum[wiId]; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(wiId < dim_wells){ - temp = 0.0; - for(unsigned int i = 0; i < dim_wells; ++i){ - temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; - } - z2[wiId] = temp; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(wiId < dim*valSize){ - temp = 0.0; - int bb = wiId/dim + val_pointers[wgId]; - for (unsigned int j = 0; j < dim_wells; ++j){ - temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; - } - int colIdx = Ccols[bb]; - y[colIdx*dim + c] -= temp; - } - } - )"; + /// Generate string with the exact ilu decomposition kernel + /// The kernel takes a full BSR matrix and performs inplace ILU decomposition + std::string get_ilu_decomp_string(); } // end namespace bda diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index 43ba050e2..e71f5c9c1 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -26,7 +26,6 @@ #include #include -#include #include #include @@ -46,7 +45,6 @@ template openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) { prec = new Preconditioner(opencl_ilu_reorder, verbosity_); - cl_int err = CL_SUCCESS; std::ostringstream out; try { std::vector platforms; @@ -332,20 +330,23 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr //applyblockedscaleadd(-1.0, mat, x, r); // set initial values - cl::Event event; - queue->enqueueFillBuffer(d_p, 0, 0, 
sizeof(double) * N);
-        queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &event);
-        event.wait();
+        events.resize(5);
+        err = queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
+        err |= queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
 
         rho = 1.0;
         alpha = 1.0;
         omega = 1.0;
 
-        queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &event);
-        event.wait();
-        queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &event);
-        event.wait();
-        queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
-        event.wait();
+        err |= queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
+        err |= queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
+        err |= queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
+
+        cl::WaitForEvents(events);
+        events.clear();
+        if (err != CL_SUCCESS) {
+            // enqueue[Fill|Copy]Buffer returns a cl_int and does not throw exceptions like the C++ OpenCL API
+            OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
+        }
 
         norm = norm_w(d_r, d_tmp);
         norm_0 = norm;
@@ -475,18 +476,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     out.clear();
 
     try {
-        cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s)));    // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, ::size_t> >'
-        source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
-        source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
-        source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
-        source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
-        source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
-        source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
-        source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
-        source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
-        program = cl::Program(*context, source);
-        program.build(devices);
-
         prec->setOpenCLContext(context.get());
         prec->setOpenCLQueue(queue.get());
 
@@ -518,27 +507,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
             d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
         }
 
-        // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
-        // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
+        get_opencl_kernels();
 
-        // actually creating the kernels
-        dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
-        norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
-        axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const int>(cl::Kernel(program, "axpy")));
-        custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const int>(cl::Kernel(program, "custom")));
-        spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
-        ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
-        ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
-        stdwell_apply_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
-        stdwell_apply_no_reorder_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
-
-        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
+        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());
 
     } catch (const cl::Error& error) {
         std::ostringstream oss;
@@ -554,6 +525,59 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     initialized = true;
 } // end initialize()
 
+void add_kernel_string(cl::Program::Sources &sources, std::string &source) {
+    sources.emplace_back(std::make_pair(source.c_str(), source.size()));
+}
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::get_opencl_kernels() {
+
+    cl::Program::Sources sources;
+    std::string axpy_s = get_axpy_string();
+    add_kernel_string(sources, axpy_s);
+    std::string dot_1_s = get_dot_1_string();
+    add_kernel_string(sources, dot_1_s);
+    std::string norm_s = get_norm_string();
+    add_kernel_string(sources, norm_s);
+    std::string custom_s = get_custom_string();
+    add_kernel_string(sources, custom_s);
+    std::string spmv_blocked_s = get_spmv_blocked_string();
+    add_kernel_string(sources, spmv_blocked_s);
+#if CHOW_PATEL
+    bool ilu_operate_on_full_matrix = false;
+#else
+    bool ilu_operate_on_full_matrix = true;
+#endif
+    std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
+    add_kernel_string(sources, ILU_apply1_s);
+    std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
+    add_kernel_string(sources, ILU_apply2_s);
+    std::string stdwell_apply_s = get_stdwell_apply_string(true);
+    add_kernel_string(sources, stdwell_apply_s);
+    std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
+    add_kernel_string(sources, stdwell_apply_no_reorder_s);
+    std::string ilu_decomp_s = get_ilu_decomp_string();
+    add_kernel_string(sources, ilu_decomp_s);
+
+    cl::Program program = cl::Program(*context, sources);
+    program.build(devices);
+
+    // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
+    // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
+
+    // actually creating the kernels
+    dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
+    norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
+    axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const int>(cl::Kernel(program, "axpy")));
+    custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const int>(cl::Kernel(program, "custom")));
+    spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
+    ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
+    ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
+    stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
+    stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
+    ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
+} // end get_opencl_kernels()
+
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
     if (opencl_ilu_reorder != ILUReorder::NONE) {
@@ -569,7 +593,7 @@ void openclSolverBackend<block_size>::finalize() {
 
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::copy_system_to_gpu() {
     Timer t;
-    cl::Event event;
+    events.resize(5);
 
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
         memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
 #else
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
 #endif
 
-    queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
-    queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
-    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
+    err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]);
+    err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]);
+    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]);
+    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
 
     if (opencl_ilu_reorder != ILUReorder::NONE) {
-        queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
+        events.resize(6);
+        err |= queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder, nullptr, &events[5]);
+    }
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer returns a cl_int and does not throw exceptions like the C++ OpenCL API
+        OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error");
     }
-    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
-    event.wait();
 
     if (verbosity > 2) {
         std::ostringstream out;
@@ -603,7 +633,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
 
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::update_system_on_gpu() {
     Timer t;
-    cl::Event event;
+    events.resize(3);
 
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
         int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
         memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
 #else
-    queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
+    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
 #endif
 
-    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
-    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
-    event.wait();
+    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]);
+    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
+    cl::WaitForEvents(events);
+    events.clear();
+    if (err != CL_SUCCESS) {
+        // enqueueWriteBuffer returns a cl_int and does not throw exceptions like the C++ OpenCL API
+        OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error");
+    }
 
     if (verbosity > 2) {
         std::ostringstream out;
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index aae527643..b57a8dfd6 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -21,6 +21,7 @@
 #define OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -64,22 +65,16 @@ private:
     // shared pointers are also passed to other objects
     std::vector<cl::Device> devices;
 
-    cl::Program program;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const int> > axpy_k;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const int> > custom_k;
-    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_no_reorder_k;
+    std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
+    std::shared_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
+    std::shared_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
+    std::shared_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
+    std::shared_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
+    std::shared_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
 
     Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
     int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@@ -87,6 +82,8 @@ private:
     std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
     BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
     ILUReorder opencl_ilu_reorder; // reordering strategy
+    std::vector<cl::Event> events;
+    cl_int err;
 
     /// Divide A by B, and round up: return (int)ceil(A/B)
     /// \param[in] A    dividend
@@ -147,6 +144,9 @@ private:
     /// \param[in] cols   array of columnIndices, contains nnz values
     void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
 
+    /// Generate and compile OpenCL kernels
+    void get_opencl_kernels();
+
     /// Clean memory
     void finalize();
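
Reviewer note: the dot_1 and norm kernels above deliberately stop at one partial result per work-group; the final reduction happens on the host, as their doc comments say. A minimal sketch of that host-side step (not part of the patch; `finalize_dot`/`finalize_norm` and the `partials` vector are hypothetical names, assuming the partial results have already been read back from the kernel's output buffer):

```cpp
#include <cmath>
#include <vector>

// Each work-group wrote one partial sum; the host finishes the reduction.
double finalize_dot(const std::vector<double>& partials)
{
    double sum = 0.0;
    for (double p : partials) {
        sum += p;                       // add per-work-group partial sums on the CPU
    }
    return sum;                         // final dot product
}

double finalize_norm(const std::vector<double>& partials)
{
    // norm returns partial sums of squares; the square root is taken on the CPU
    return std::sqrt(finalize_dot(partials));
}
```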
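For reference, the "custom" kernel documented as p = (p - omega * v) * beta + r is simply a fused BiCGStab vector update. A serial C++ equivalent, illustrative only and not part of the patch:

```cpp
// Serial reference for the per-element update the "custom" kernel fuses on the GPU.
void custom_reference(double* p, const double* v, const double* r,
                      double omega, double beta, int N)
{
    for (int i = 0; i < N; ++i) {
        p[i] = (p[i] - omega * v[i]) * beta + r[i];
    }
}
```

Fusing these three BLAS-1-style operations into one kernel saves two full passes over the vectors per BiCGStab iteration, which is why the backend generates it as a single kernel.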
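The transfer routines in this patch all follow the same shape: record one cl::Event per enqueue, OR the returned cl_int codes together, synchronize once with cl::WaitForEvents, then check the accumulated code. A self-contained sketch of that pattern under stated assumptions (the function, buffer, and host-pointer names are hypothetical, and the include path mirrors the project's own OpenCL wrapper header):

```cpp
#include <stdexcept>
#include <vector>
#include <opm/simulators/linalg/bda/opencl.hpp> // assumed wrapper for the cl:: C++ bindings

void write_two_buffers(cl::CommandQueue& queue,
                       cl::Buffer& d_a, const void* h_a, size_t bytes_a,
                       cl::Buffer& d_b, const void* h_b, size_t bytes_b)
{
    std::vector<cl::Event> events(2);
    // non-blocking enqueues: each call records an event and returns a cl_int
    cl_int err = queue.enqueueWriteBuffer(d_a, CL_FALSE, 0, bytes_a, h_a, nullptr, &events[0]);
    err |= queue.enqueueWriteBuffer(d_b, CL_FALSE, 0, bytes_b, h_b, nullptr, &events[1]);
    cl::WaitForEvents(events);   // single synchronization point for all transfers
    if (err != CL_SUCCESS) {
        // OR-ing the codes loses the exact error value, but still detects that some call failed
        throw std::logic_error("enqueueWriteBuffer failed");
    }
}
```

The single wait replaces the previous per-call event.wait(), letting the driver overlap the transfers where it can.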