From 9f92a690377cfff031e506d2fe154738dd8ffa13 Mon Sep 17 00:00:00 2001 From: tqiu Date: Thu, 17 Dec 2020 14:49:59 +0100 Subject: [PATCH 01/14] Added CPU and GPU implementations of Fine-Grained Parallel ILU (FGPILU) --- CMakeLists_files.cmake | 2 + opm/simulators/linalg/bda/BILU0.cpp | 262 ++++++++- opm/simulators/linalg/bda/BILU0.hpp | 5 + opm/simulators/linalg/bda/BdaBridge.cpp | 2 + opm/simulators/linalg/bda/BlockedMatrix.cpp | 12 + opm/simulators/linalg/bda/BlockedMatrix.hpp | 8 + opm/simulators/linalg/bda/ILUReorder.hpp | 3 +- opm/simulators/linalg/bda/fgpilu.cpp | 570 ++++++++++++++++++++ opm/simulators/linalg/bda/fgpilu.hpp | 86 +++ opm/simulators/linalg/bda/opencl.cpp | 3 + 10 files changed, 949 insertions(+), 4 deletions(-) create mode 100644 opm/simulators/linalg/bda/fgpilu.cpp create mode 100644 opm/simulators/linalg/bda/fgpilu.hpp diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 70cc1c3df..70cb66a2a 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -58,6 +58,7 @@ if(OPENCL_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/fgpilu.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) @@ -168,6 +169,7 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/bda/BlockedMatrix.hpp opm/simulators/linalg/bda/cuda_header.hpp opm/simulators/linalg/bda/cusparseSolverBackend.hpp + opm/simulators/linalg/bda/fgpilu.hpp opm/simulators/linalg/bda/Reorder.hpp opm/simulators/linalg/bda/ILUReorder.hpp opm/simulators/linalg/bda/opencl.hpp diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 24905520d..dedb7d222 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -25,12 +25,21 @@ #include #include +#include #include namespace bda { +// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU +// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in: +// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896 +// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU +// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU +#define CHOW_PATEL 1 +#define CHOW_PATEL_GPU 1 + using Opm::OpmLog; using Dune::Timer; @@ -82,11 +91,26 @@ namespace bda } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) { out << "BILU0 reordering strategy: " << "graph_coloring\n"; findGraphColoring(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor); + } else if (opencl_ilu_reorder == ILUReorder::NONE) { + out << "BILU0 reordering strategy: none\n"; + numColors = 1; + rowsPerColor.emplace_back(Nb); + // numColors = Nb; + // for(int i = 0; i < Nb; ++i){ + // rowsPerColor.emplace_back(1); + // } + for(int i = 0; i < Nb; ++i){ + toOrder[i] = i; + fromOrder[i] = i; + } } else { OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n"); } if(verbosity >= 3){ - out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors"; + out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n"; + } + if(verbosity >= 2){ + out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU; } OpmLog::info(out.str()); @@ -129,6 +153,233 @@ namespace bda return true; } // end init() + // implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015 + template + void BILU0::chow_patel_decomposition() + { + const unsigned int bs = block_size; + int num_sweeps = 6; + + // split matrix into L and U + // also convert U into BSC format (Ut) + // Ut stores diagonal for now + int num_blocks_L = 0; + + // Ut is actually BSC format + std::unique_ptr > Ut = std::make_unique >(Umat->Nb, Umat->nnzbs + Umat->Nb); + + Lmat->rowPointers[0] = 0; + for (int i = 0; i < Nb+1; i++) { + Ut->rowPointers[i] = 0; + } + + // for every row + for (int i = 0; i < Nb; i++) { + int iRowStart = LUmat->rowPointers[i]; + int iRowEnd = LUmat->rowPointers[i + 1]; + // for every block in this row + for (int ij = iRowStart; ij < iRowEnd; ij++) { + int j = LUmat->colIndices[ij]; + if (i <= j) { + Ut->rowPointers[j+1]++; // actually colPointers + } else { + Lmat->colIndices[num_blocks_L] = j; + memcpy(Lmat->nnzValues + num_blocks_L * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs); + num_blocks_L++; + } + } + Lmat->rowPointers[i+1] = num_blocks_L; + } + + // prefix sum + int sum = 0; + for (int i = 1; i < Nb+1; i++) { + sum += Ut->rowPointers[i]; + Ut->rowPointers[i] = sum; + } + + // for every row + for (int i = 0; i < Nb; i++) { + int iRowStart = LUmat->rowPointers[i]; + int iRowEnd = LUmat->rowPointers[i + 1]; + // for every block in this row + for (int ij = iRowStart; ij < iRowEnd; ij++) { + int j = LUmat->colIndices[ij]; + if (i <= j){ + int idx = Ut->rowPointers[j]++; + Ut->colIndices[idx] = i; // actually rowIndices + memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs); + } + } + } + + // rotate + // the Ut->rowPointers were increased in the last loop + for (int i = Nb; i > 0; --i) { + Ut->rowPointers[i] = Ut->rowPointers[i-1]; + } + Ut->rowPointers[0] = 0; + + Opm::Detail::Inverter inverter; + + // Utmp is needed for CPU and GPU decomposition, because U is transposed, and reversed at the end + // Ltmp is only needed for CPU decomposition, GPU creates GPU buffer for Ltmp + double *Utmp = new double[Ut->nnzbs * block_size * block_size]; + + // actual ILU decomposition +#if CHOW_PATEL_GPU + fgpilu.decomposition(queue, context, + Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs, + Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs, + LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs, + Nb, num_sweeps, verbosity); +#else + double *Ltmp = new double[Lmat->nnzbs * block_size * block_size]; + for (int sweep = 0; sweep < num_sweeps; ++sweep) { + + // for every row + for (int row = 0; row < Nb; row++) { + // update U + int jColStart = Ut->rowPointers[row]; + int jColEnd = Ut->rowPointers[row + 1]; + // for every block in this row + for (int ij = jColStart; ij < jColEnd; ij++) { + int col = Ut->colIndices[ij]; + // refine Uij element (or diagonal) + int i1 = LUmat->rowPointers[col]; + int i2 = LUmat->rowPointers[col+1]; + int kk = 0; + for(kk = i1; kk < i2; ++kk) { + ptrdiff_t c = LUmat->colIndices[kk]; + if (c >= row) { + break; + } + } + double aij[bs*bs]; + memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs); + int jk = Lmat->rowPointers[col]; + int ik = (jk < Lmat->rowPointers[col+1]) ? Lmat->colIndices[jk] : Nb; + + for (int k = jColStart; k < ij; ++k) { + int ki = Ut->colIndices[k]; + while (ik < ki) { + ++jk; + ik = Lmat->colIndices[jk]; + } + if (ik == ki) { + blockMultSub(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs); + } + } + + memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs); + } + + // update L + int iRowStart = Lmat->rowPointers[row]; + int iRowEnd = Lmat->rowPointers[row + 1]; + + for (int ij = iRowStart; ij < iRowEnd; ij++) { + int j = Lmat->colIndices[ij]; + // refine Lij element + int i1 = LUmat->rowPointers[row]; + int i2 = LUmat->rowPointers[row+1]; + int kk = 0; + for(kk = i1; kk < i2; ++kk) { + ptrdiff_t c = LUmat->colIndices[kk]; + if (c >= j) { + break; + } + } + double aij[bs*bs]; + memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs); + int jk = Ut->rowPointers[j]; + int ik = Ut->colIndices[jk]; + for (int k = iRowStart; k < ij; ++k) { + int ki = Lmat->colIndices[k]; + while(ik < ki) { + ++jk; + ik = Ut->colIndices[jk]; + } + + if(ik == ki) { + blockMultSub(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs); + } + } + // calculate aij / ujj + double ujj[bs*bs]; + inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]); + // lij = aij / ujj + blockMult(&aij[0], &ujj[0], Ltmp + ij * bs * bs); + + } + } + double *t = Lmat->nnzValues; + Lmat->nnzValues = Ltmp; + Ltmp = t; + t = Ut->nnzValues; + Ut->nnzValues = Utmp; + Utmp = t; + } // end sweep + delete[] Ltmp; +#endif + + // convert Ut to BSR + // diagonal stored separately + std::vector ptr(Nb+1, 0); + std::vector col(Ut->rowPointers[Nb]); + + // count blocks per row for U (BSR) + // store diagonal in invDiagVals + for(int i = 0; i < Nb; ++i) { + for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) { + int j = Ut->colIndices[k]; + if (j == i) { + inverter(Ut->nnzValues + k * bs * bs, invDiagVals + i * bs * bs); + } else { + ++ptr[j+1]; + } + } + } + + // prefix sum + int sumU = 0; + for (int i = 1; i < Nb+1; i++) { + sumU += ptr[i]; + ptr[i] = sumU; + } + + // actually copy nonzero values for U + for(int i = 0; i < Nb; ++i) { + for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) { + int j = Ut->colIndices[k]; + if (j != i) { + int head = ptr[j]++; + col[head] = i; + memcpy(Utmp + head * bs * bs, Ut->nnzValues + k * bs * bs, sizeof(double) * bs * bs); + } + } + } + + // the ptr[] were increased in the last loop + std::rotate(ptr.begin(), ptr.end() - 1, ptr.end()); + ptr.front() = 0; + + // reversing the rows of U, because that is the order they are used in + int URowIndex = 0; + int offsetU = 0; // number of nnz blocks that are already copied to Umat + Umat->rowPointers[0] = 0; + for (int i = LUmat->Nb - 1; i >= 0; i--) { + int rowSize = ptr[i + 1] - ptr[i]; // number of blocks in this row + memcpy(Umat->nnzValues + offsetU * bs * bs, Utmp + ptr[i] * bs * bs, sizeof(double) * bs * bs * rowSize); + memcpy(Umat->colIndices + offsetU, col.data() + ptr[i], sizeof(int) * rowSize); + offsetU += rowSize; + Umat->rowPointers[URowIndex + 1] = offsetU; + URowIndex++; + } + + delete[] Utmp; + + } template bool BILU0::create_preconditioner(BlockedMatrix *mat) @@ -153,6 +404,10 @@ namespace bda OpmLog::info(out.str()); } + Timer t_decomposition; +#if CHOW_PATEL + chow_patel_decomposition(); +#else int i, j, ij, ik, jk; int iRowStart, iRowEnd, jRowEnd; double pivot[bs * bs]; @@ -160,8 +415,6 @@ namespace bda int LSize = 0; Opm::Detail::Inverter inverter; // reuse inverter to invert blocks - Timer t_decomposition; - // go through all rows for (i = 0; i < LUmat->Nb; i++) { iRowStart = LUmat->rowPointers[i]; @@ -239,6 +492,7 @@ namespace bda Umat->rowPointers[URowIndex + 1] = offsetU; URowIndex++; } +#endif if (verbosity >= 3) { std::ostringstream out; out << "BILU0 decomposition: " << t_decomposition.stop() << " s"; @@ -278,6 +532,7 @@ namespace bda event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); // event.wait(); } + for(int color = numColors-1; color >= 0; --color){ event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); // event.wait(); @@ -320,6 +575,7 @@ namespace bda template BILU0::BILU0(ILUReorder, int); \ template BILU0::~BILU0(); \ template bool BILU0::init(BlockedMatrix*); \ +template void BILU0::chow_patel_decomposition(); \ template bool BILU0::create_preconditioner(BlockedMatrix*); \ template void BILU0::apply(cl::Buffer& x, cl::Buffer& y); \ template void BILU0::setOpenCLContext(cl::Context*); \ diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp index bad6201ca..5a31c57a9 100644 --- a/opm/simulators/linalg/bda/BILU0.hpp +++ b/opm/simulators/linalg/bda/BILU0.hpp @@ -24,6 +24,7 @@ #include #include +#include namespace bda { @@ -67,6 +68,10 @@ namespace bda int lmem_per_work_group = 0; bool pattern_uploaded = false; + FGPILU fgpilu; + + void chow_patel_decomposition(); + public: BILU0(ILUReorder opencl_ilu_reorder, int verbosity); diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index 71e797450..39d403cb7 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -58,6 +58,8 @@ BdaBridge::BdaBridge(std::string gpu_mod ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING; } else if (opencl_ilu_reorder == "graph_coloring") { ilu_reorder = bda::ILUReorder::GRAPH_COLORING; + } else if (opencl_ilu_reorder == "none") { + ilu_reorder = bda::ILUReorder::NONE; } else { OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'"); } diff --git a/opm/simulators/linalg/bda/BlockedMatrix.cpp b/opm/simulators/linalg/bda/BlockedMatrix.cpp index f4ffc0706..050a17a95 100644 --- a/opm/simulators/linalg/bda/BlockedMatrix.cpp +++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp @@ -93,11 +93,23 @@ void blockMult(double *mat1, double *mat2, double *resMat) { } } +// subtract c from b and store in a +// a = b - c +template +void blockSub(double *a, double *b, double *c) +{ + for (unsigned int row = 0; row < block_size; row++) { + for (unsigned int col = 0; col < block_size; col++) { + a[block_size * row + col] = b[block_size * row + col] - c[block_size * row + col]; + } + } +} #define INSTANTIATE_BDA_FUNCTIONS(n) \ template void sortBlockedRow(int *, double *, int, int); \ template void blockMultSub(double *, double *, double *); \ template void blockMult(double *, double *, double *); \ +template void blockSub(double *, double *, double *); \ INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(2); diff --git a/opm/simulators/linalg/bda/BlockedMatrix.hpp b/opm/simulators/linalg/bda/BlockedMatrix.hpp index 39a573bb5..80488a8dc 100644 --- a/opm/simulators/linalg/bda/BlockedMatrix.hpp +++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp @@ -109,6 +109,14 @@ void blockMultSub(double *a, double *b, double *c); template void blockMult(double *mat1, double *mat2, double *resMat); +/// Subtract blocks +/// a = b - c +/// \param[out] a result block +/// \param[in] b input block +/// \param[in] c input block +template +void blockSub(double *a, double *b, double *c); + } // end namespace bda #endif diff --git a/opm/simulators/linalg/bda/ILUReorder.hpp b/opm/simulators/linalg/bda/ILUReorder.hpp index 83039cf6d..b7ea06ce2 100644 --- a/opm/simulators/linalg/bda/ILUReorder.hpp +++ b/opm/simulators/linalg/bda/ILUReorder.hpp @@ -27,7 +27,8 @@ namespace bda enum class ILUReorder { LEVEL_SCHEDULING, - GRAPH_COLORING + GRAPH_COLORING, + NONE }; } diff --git a/opm/simulators/linalg/bda/fgpilu.cpp b/opm/simulators/linalg/bda/fgpilu.cpp new file mode 100644 index 000000000..508fcafe0 --- /dev/null +++ b/opm/simulators/linalg/bda/fgpilu.cpp @@ -0,0 +1,570 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + + +#include +#include +#include + +#include + +namespace bda +{ + + using Opm::OpmLog; + +// if PARALLEL is 0: +// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially. +// Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously, +// except when the number of blocks in that row exceeds the number of workitems per workgroup. +// In that case some workitems will process multiple blocks sequentially. +// else: +// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially +// Each block in a row gets a warp of 32 workitems, of which 9 are always active. +// Multiple blocks can be processed in parallel if a workgroup contains multiple warps. +// If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially. + +// Notes: +// PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32. +// PARALLEL 1 should be run with at least 32 workitems per workgroup. +// The recommended number of workgroups for both options is Nb, which gives every row their own workgroup. +// PARALLEL 0 is generally faster, despite not having parallelization. + +#define PARALLEL 0 + +#if PARALLEL + +inline const char* fgpilu_sweep_s = R"( + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +void blockMultSub( + __local double * restrict a, + __global const double * restrict b, + __global const double * restrict c) +{ + const unsigned int block_size = 3; + const unsigned int warp_size = 32; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads) + if(thread_id_in_warp < block_size * block_size){ + const unsigned int row = thread_id_in_warp / block_size; + const unsigned int col = thread_id_in_warp % block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += b[block_size * row + k] * c[block_size * k + col]; + } + a[block_size * row + col] -= temp; + } +} + + +void blockMult( + __local const double * restrict mat1, + __local const double * restrict mat2, + __global double * restrict resMat) +{ + const unsigned int block_size = 3; + const unsigned int warp_size = 32; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads) + if(thread_id_in_warp < block_size * block_size){ + const unsigned int row = thread_id_in_warp / block_size; + const unsigned int col = thread_id_in_warp % block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += mat1[block_size * row + k] * mat2[block_size * k + col]; + } + resMat[block_size * row + col] = temp; + } +} + + +void invert( + __global const double * restrict matrix, + __local double * restrict inverse) +{ + const unsigned int block_size = 3; + const unsigned int bs = block_size; + const unsigned int warp_size = 32; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads) + if(thread_id_in_warp < block_size * block_size){ + // code generated by maple, copied from Dune::DenseMatrix + double t4 = matrix[0] * matrix[4]; + double t6 = matrix[0] * matrix[5]; + double t8 = matrix[1] * matrix[3]; + double t10 = matrix[2] * matrix[3]; + double t12 = matrix[1] * matrix[6]; + double t14 = matrix[2] * matrix[6]; + + double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] + + t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]); + double t17 = 1.0 / det; + + const unsigned int r = thread_id_in_warp / block_size; + const unsigned int c = thread_id_in_warp % block_size; + const unsigned int r1 = (r+1) % bs; + const unsigned int c1 = (c+1) % bs; + const unsigned int r2 = (r+bs-1) % bs; + const unsigned int c2 = (c+bs-1) % bs; + inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17; + } +} + +__kernel void fgpilu_sweep( + __global const double * restrict Ut_vals, + __global const double * restrict L_vals, + __global const double * restrict LU_vals, + __global const int * restrict Ut_rows, + __global const int * restrict L_rows, + __global const int * restrict LU_rows, + __global const int * restrict Ut_cols, + __global const int * restrict L_cols, + __global const int * restrict LU_cols, + __global double * restrict Ltmp, + __global double * restrict Utmp, + const int Nb, + __local double *aij, + __local double *ujj) +{ + const int bs = 3; + + // for every row + const unsigned int warp_size = 32; + const unsigned int bsize = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int num_groups = get_num_groups(0); + const unsigned int warps_per_group = bsize / warp_size; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads) + const unsigned int warp_id_in_group = idx_t / warp_size; + const unsigned int lmem_offset = warp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it + for (int row = idx_b; row < Nb; row+=num_groups) { + int jColStart = Ut_rows[row]; + int jColEnd = Ut_rows[row + 1]; + for (int ij = jColStart + warp_id_in_group; ij < jColEnd; ij+=warps_per_group) { + int col = Ut_cols[ij]; + // refine Uij element (or diagonal) + int i1 = LU_rows[col]; + int i2 = LU_rows[col+1]; + int kk = 0; + for(kk = i1; kk < i2; ++kk) { + int c = LU_cols[kk]; + if (c >= row) { + break; + } + } + + if(thread_id_in_warp < bs*bs){ + aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp]; + } + + int jk = L_rows[col]; + int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb; + + for (int k = jColStart; k < ij; ++k) { + int ki = Ut_cols[k]; + while (ik < ki) { + ++jk; + ik = L_cols[jk]; + } + if (ik == ki) { + blockMultSub(aij+lmem_offset, L_vals + jk * bs * bs, Ut_vals + k * bs * bs); + } + } + + if(thread_id_in_warp < bs*bs){ + Utmp[ij*bs*bs + thread_id_in_warp] = aij[lmem_offset + thread_id_in_warp]; + } + } + + // update L + int iRowStart = L_rows[row]; + int iRowEnd = L_rows[row + 1]; + + for (int ij = iRowStart + warp_id_in_group; ij < iRowEnd; ij+=warps_per_group) { + // for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) { + int j = L_cols[ij]; + // // refine Lij element + int i1 = LU_rows[row]; + int i2 = LU_rows[row+1]; + int kk = 0; + for(kk = i1; kk < i2; ++kk) { + int c = LU_cols[kk]; + if (c >= j) { + break; + } + } + + if(thread_id_in_warp < bs*bs){ + aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp]; + } + + int jk = Ut_rows[j]; + int ik = Ut_cols[jk]; + for (int k = iRowStart; k < ij; ++k) { + int ki = L_cols[k]; + while(ik < ki) { + ++jk; + ik = Ut_cols[jk]; + } + + if(ik == ki) { + blockMultSub(aij+lmem_offset, L_vals + k * bs * bs , Ut_vals + jk * bs * bs); + } + } + + // calculate aij / ujj + invert(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+lmem_offset); + + // lij = aij / ujj + blockMult(aij+lmem_offset, ujj+lmem_offset, Ltmp + ij * bs * bs); + } + } +} +)"; + +#else + +inline const char* fgpilu_sweep_s = R"( + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +void blockMultSub( + __local double * restrict a, + __global const double * restrict b, + __global const double * restrict c) +{ + const unsigned int block_size = 3; + for (unsigned int row = 0; row < block_size; row++) { + for (unsigned int col = 0; col < block_size; col++) { + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += b[block_size * row + k] * c[block_size * k + col]; + } + a[block_size * row + col] -= temp; + } + } +} + + +void blockMult( + __local const double * restrict mat1, + __local const double * restrict mat2, + __global double * restrict resMat) +{ + const unsigned int block_size = 3; + for (unsigned int row = 0; row < block_size; row++) { + for (unsigned int col = 0; col < block_size; col++) { + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += mat1[block_size * row + k] * mat2[block_size * k + col]; + } + resMat[block_size * row + col] = temp; + } + } +} + + +__kernel void inverter( + __global const double * restrict matrix, + __local double * restrict inverse) +{ + // code generated by maple, copied from Dune::DenseMatrix + double t4 = matrix[0] * matrix[4]; + double t6 = matrix[0] * matrix[5]; + double t8 = matrix[1] * matrix[3]; + double t10 = matrix[2] * matrix[3]; + double t12 = matrix[1] * matrix[6]; + double t14 = matrix[2] * matrix[6]; + + double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] + + t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]); + double t17 = 1.0 / det; + + inverse[0] = (matrix[4] * matrix[8] - matrix[5] * matrix[7]) * t17; + inverse[1] = -(matrix[1] * matrix[8] - matrix[2] * matrix[7]) * t17; + inverse[2] = (matrix[1] * matrix[5] - matrix[2] * matrix[4]) * t17; + inverse[3] = -(matrix[3] * matrix[8] - matrix[5] * matrix[6]) * t17; + inverse[4] = (matrix[0] * matrix[8] - t14) * t17; + inverse[5] = -(t6 - t10) * t17; + inverse[6] = (matrix[3] * matrix[7] - matrix[4] * matrix[6]) * t17; + inverse[7] = -(matrix[0] * matrix[7] - t12) * t17; + inverse[8] = (t4 - t8) * t17; +} + +__kernel void fgpilu_sweep( + __global const double * restrict Ut_vals, + __global const double * restrict L_vals, + __global const double * restrict LU_vals, + __global const int * restrict Ut_rows, + __global const int * restrict L_rows, + __global const int * restrict LU_rows, + __global const int * restrict Ut_cols, + __global const int * restrict L_cols, + __global const int * restrict LU_cols, + __global double * restrict Ltmp, + __global double * restrict Utmp, + const int Nb, + __local double *aij, + __local double *ujj) +{ + const int bs = 3; + + // for every row + const unsigned int warp_size = 32; + const unsigned int bsize = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int num_groups = get_num_groups(0); + const unsigned int warps_per_group = bsize / warp_size; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads) + const unsigned int warp_id_in_group = idx_t / warp_size; + for (int row = idx_b; row < Nb; row+=num_groups) { + int jColStart = Ut_rows[row]; + int jColEnd = Ut_rows[row + 1]; + for (int ij = jColStart + idx_t; ij < jColEnd; ij+=bsize) { + int col = Ut_cols[ij]; + // refine Uij element (or diagonal) + int i1 = LU_rows[col]; + int i2 = LU_rows[col+1]; + int kk = 0; + for(kk = i1; kk < i2; ++kk) { + int c = LU_cols[kk]; + if (c >= row) { + break; + } + } + + for(int z = 0; z < bs*bs; ++z){ + aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z]; + } + + int jk = L_rows[col]; + int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb; + + for (int k = jColStart; k < ij; ++k) { + int ki = Ut_cols[k]; + while (ik < ki) { + ++jk; + ik = L_cols[jk]; + } + if (ik == ki) { + blockMultSub(aij+idx_t*bs*bs, L_vals + jk * bs * bs, Ut_vals + k * bs * bs); + } + } + + for(int z = 0; z < bs*bs; ++z){ + Utmp[ij*bs*bs + z] = aij[idx_t*bs*bs+z]; + } + } + + // update L + int iRowStart = L_rows[row]; + int iRowEnd = L_rows[row + 1]; + + for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) { + int j = L_cols[ij]; + // // refine Lij element + int i1 = LU_rows[row]; + int i2 = LU_rows[row+1]; + int kk = 0; + for(kk = i1; kk < i2; ++kk) { + int c = LU_cols[kk]; + if (c >= j) { + break; + } + } + + for(int z = 0; z < bs*bs; ++z){ + aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z]; + } + + int jk = Ut_rows[j]; + int ik = Ut_cols[jk]; + for (int k = iRowStart; k < ij; ++k) { + int ki = L_cols[k]; + while(ik < ki) { + ++jk; + ik = Ut_cols[jk]; + } + + if(ik == ki) { + blockMultSub(aij+idx_t*bs*bs, L_vals + k * bs * bs , Ut_vals + jk * bs * bs); + } + } + // calculate aij / ujj + inverter(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+idx_t*bs*bs); + + // lij = aij / ujj + blockMult(aij+idx_t*bs*bs, ujj+idx_t*bs*bs, Ltmp + ij * bs * bs); + } + } +} +)"; + +#endif + + + + + + + +void FGPILU::decomposition( + cl::CommandQueue *queue, cl::Context *context, + int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs, + int *L_rows, int *L_cols, double *L_vals, int L_nnzbs, + int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs, + int Nb, int num_sweeps, int verbosity) +{ + const int block_size = 3; + + try { + if (initialized == false) { + cl::Program::Sources source(1, std::make_pair(fgpilu_sweep_s, strlen(fgpilu_sweep_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector >' + cl::Program program = cl::Program(*context, source, &err); + + std::vector devices = context->getInfo(); + program.build(devices); + + fgpilu_sweep_k.reset(new cl::make_kernel(cl::Kernel(program, "fgpilu_sweep", &err))); + + // allocate GPU memory + d_Ut_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size); + d_L_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size); + d_LU_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * LU_nnzbs * block_size * block_size); + d_Ut_ptrs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1)); + d_L_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1)); + d_LU_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1)); + d_Ut_idxs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Ut_nnzbs); + d_L_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * L_nnzbs); + d_LU_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LU_nnzbs); + d_Ltmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size); + d_Utmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size); + + Dune::Timer t_copy_pattern; + err |= queue->enqueueWriteBuffer(d_Ut_ptrs, CL_TRUE, 0, sizeof(int) * (Nb+1), Ut_ptrs); + err |= queue->enqueueWriteBuffer(d_L_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), L_rows); + err |= queue->enqueueWriteBuffer(d_LU_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), LU_rows); + err |= queue->enqueueWriteBuffer(d_Ut_idxs, CL_TRUE, 0, sizeof(int) * Ut_nnzbs, Ut_idxs); + err |= queue->enqueueWriteBuffer(d_L_cols, CL_TRUE, 0, sizeof(int) * L_nnzbs, L_cols); + err |= queue->enqueueWriteBuffer(d_LU_cols, CL_TRUE, 0, sizeof(int) * LU_nnzbs, LU_cols); + if (verbosity >= 3){ + std::ostringstream out; + out << "FGPILU copy sparsity pattern time: " << t_copy_pattern.stop() << " s"; + OpmLog::info(out.str()); + } + if (verbosity >= 2){ + std::ostringstream out; + out << "FGPILU PARALLEL: " << PARALLEL; + OpmLog::info(out.str()); + } + + initialized = true; + } + + + // copy to GPU + Dune::Timer t_copy1; + err = queue->enqueueWriteBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals); + err |= queue->enqueueWriteBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals); + err |= queue->enqueueWriteBuffer(d_LU_vals, CL_TRUE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals); + if (verbosity >= 3){ + std::ostringstream out; + out << "FGPILU copy1 time: " << t_copy1.stop() << " s"; + OpmLog::info(out.str()); + } + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueWriteBuffer error"); + } + + // call kernel + for (int sweep = 0; sweep < num_sweeps; ++sweep) { + // normally, L_vals and Ltmp are swapped after the sweep is done + // these conditionals implement that without actually swapping pointers + // 1st sweep reads X_vals, writes to Xtmp + // 2nd sweep reads Xtmp, writes to X_vals + auto *Larg1 = (sweep % 2 == 0) ? &d_L_vals : &d_Ltmp; + auto *Larg2 = (sweep % 2 == 0) ? &d_Ltmp : &d_L_vals; + auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp; + auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals; + int num_work_groups = Nb; +#if PARALLEL + int work_group_size = 32; +#else + int work_group_size = 16; +#endif + int total_work_items = num_work_groups * work_group_size; + int lmem_per_work_group = work_group_size * block_size * block_size * sizeof(double); + Dune::Timer t_kernel; + event = (*fgpilu_sweep_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + *Uarg1, *Larg1, d_LU_vals, + d_Ut_ptrs, d_L_rows, d_LU_rows, + d_Ut_idxs, d_L_cols, d_LU_cols, + *Larg2, *Uarg2, Nb, cl::Local(lmem_per_work_group), cl::Local(lmem_per_work_group)); + event.wait(); + if (verbosity >= 3){ + std::ostringstream out; + out << "FGPILU sweep kernel time: " << t_copy1.stop() << " s"; + OpmLog::info(out.str()); + } + } + + // copy back + Dune::Timer t_copy2; + if (num_sweeps % 2 == 0) { + err = queue->enqueueReadBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals); + err |= queue->enqueueReadBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals); + } else { + err = queue->enqueueReadBuffer(d_Utmp, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals); + err |= queue->enqueueReadBuffer(d_Ltmp, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals); + } + err |= queue->enqueueBarrierWithWaitList(); + if (verbosity >= 3){ + std::ostringstream out; + out << "FGPILU copy2 time: " << t_copy2.stop() << " s"; + OpmLog::info(out.str()); + } + if (err != CL_SUCCESS) { + // enqueueReadBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueReadBuffer error"); + } + + } catch (const cl::Error& error) { + std::ostringstream oss; + oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n"; + oss << getErrorString(error.err()) << std::endl; + // rethrow exception + OPM_THROW(std::logic_error, oss.str()); + } catch (const std::logic_error& error) { + // rethrow exception by OPM_THROW in the try{} + throw error; + } +} + + +} // end namespace bda + diff --git a/opm/simulators/linalg/bda/fgpilu.hpp b/opm/simulators/linalg/bda/fgpilu.hpp new file mode 100644 index 000000000..d51c21061 --- /dev/null +++ b/opm/simulators/linalg/bda/fgpilu.hpp @@ -0,0 +1,86 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef FGPILU_HEADER_INCLUDED +#define FGPILU_HEADER_INCLUDED + + +#include + + + +namespace bda +{ + + // This class implements a blocked version on GPU of the Fine-Grained Parallel ILU (FGPILU) by Chow and Patel 2015: + // FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896 + // only blocksize == 3 is supported + // decomposition() allocates the cl::Buffers on the first call, these are C++ objects that deallocate automatically + class FGPILU + { + private: + cl::Buffer d_Ut_vals, d_L_vals, d_LU_vals; + cl::Buffer d_Ut_ptrs, d_Ut_idxs; + cl::Buffer d_L_rows, d_L_cols; + cl::Buffer d_LU_rows, d_LU_cols; + cl::Buffer d_Ltmp, d_Utmp; + + cl::Event event; + cl_int err; + + std::unique_ptr > fgpilu_sweep_k; + + bool initialized = false; + + public: + /// Executes the FGPILU sweeps + /// also copies data from CPU to GPU and GPU to CPU + /// \param[in] queue OpenCL commandqueue + /// \param[in] context OpenCL context + /// \param[in] Ut_ptrs BSC columnpointers + /// \param[in] Ut_idxs BSC rowindices + /// \param[inout] Ut_vals actual nonzeros for U + /// \param[in] Ut_nnzbs number of blocks in U + /// \param[in] L_rows BSR rowpointers + /// \param[in] L_cols BSR columnindices + /// \param[inout] L_vals actual nonzeroes for L + /// \param[in] L_nnzbs number of blocks in L + /// \param[in] LU_rows BSR rowpointers + /// \param[in] LU_cols BSR columnindices + /// \param[in] LU_vals actual nonzeroes for LU (original matrix) + /// \param[in] LU_nnzbs number of blocks in LU + /// \param[in] Nb number of blockrows + /// \param[in] num_sweeps number of sweeps to be done + /// \param[in] verbosity print verbosity + void decomposition( + cl::CommandQueue *queue, cl::Context *context, + int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs, + int *L_rows, int *L_cols, double *L_vals, int L_nnzbs, + int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs, + int Nb, int num_sweeps, int verbosity); + + }; + +} // end namespace bda + +#endif // FGPILU_HEADER_INCLUDED diff --git a/opm/simulators/linalg/bda/opencl.cpp b/opm/simulators/linalg/bda/opencl.cpp index 9108a189e..5030d6a6f 100644 --- a/opm/simulators/linalg/bda/opencl.cpp +++ b/opm/simulators/linalg/bda/opencl.cpp @@ -88,6 +88,9 @@ std::string getErrorString(cl_int error) case -67: return "CL_INVALID_LINKER_OPTIONS"; case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + // vendor specific errors + case -9999: return "NVIDIA_OPENCL_ILLEGAL_READ_OR_WRITE_TO_BUFFER"; + default: return "UNKNOWN_CL_CODE"; } } From 71692ff52b61741c60b1c9e83a430a4bbdc7b3d3 Mon Sep 17 00:00:00 2001 From: tqiu Date: Thu, 17 Dec 2020 15:50:53 +0100 Subject: [PATCH 02/14] Disabled FGPILU by default --- opm/simulators/linalg/bda/BILU0.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index dedb7d222..c21c658a1 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -37,7 +37,7 @@ namespace bda // FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896 // if CHOW_PATEL_GPU is 0, the decomposition is done on CPU // if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU -#define CHOW_PATEL 1 +#define CHOW_PATEL 0 #define CHOW_PATEL_GPU 1 using Opm::OpmLog; From f26e58c6ca4e8c10519e1960d8f9a3c12acce3c0 Mon Sep 17 00:00:00 2001 From: tqiu Date: Mon, 11 Jan 2021 16:51:05 +0100 Subject: [PATCH 03/14] Reordering is now actually skipped when NONE is chosen. Also made reordering optional for openclSolver. --- opm/simulators/linalg/bda/BILU0.cpp | 75 +++++++++++-------- .../bda/MultisegmentWellContribution.hpp | 1 + .../linalg/bda/openclSolverBackend.cpp | 42 ++++++++--- .../linalg/bda/openclSolverBackend.hpp | 7 +- 4 files changed, 80 insertions(+), 45 deletions(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index c21c658a1..165106b50 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -53,8 +53,11 @@ namespace bda { delete[] invDiagVals; delete[] diagIndex; - delete[] toOrder; - delete[] fromOrder; + if (opencl_ilu_reorder != ILUReorder::NONE) { + delete[] toOrder; + delete[] fromOrder; + } + delete[] LUmat->nnzValues; } template @@ -67,23 +70,29 @@ namespace bda this->nnz = mat->nnzbs * block_size * block_size; this->nnzbs = mat->nnzbs; - toOrder = new int[Nb]; - fromOrder = new int[Nb]; + int *CSCRowIndices = nullptr; + int *CSCColPointers = nullptr; - int *CSCRowIndices = new int[nnzbs]; - int *CSCColPointers = new int[Nb + 1]; + if (opencl_ilu_reorder == ILUReorder::NONE) { + LUmat = std::make_unique >(*mat); + } else { + toOrder = new int[Nb]; + fromOrder = new int[Nb]; + CSCRowIndices = new int[nnzbs]; + CSCColPointers = new int[Nb + 1]; + rmat = std::make_shared >(mat->Nb, mat->nnzbs); + LUmat = std::make_unique >(*rmat); - Timer t_convert; - csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb); - if(verbosity >= 3){ - std::ostringstream out; - out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s"; - OpmLog::info(out.str()); + Timer t_convert; + csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb); + if(verbosity >= 3){ + std::ostringstream out; + out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s"; + OpmLog::info(out.str()); + } } Timer t_analysis; - rmat = std::make_shared >(mat->Nb, mat->nnzbs); - LUmat = std::make_unique >(*rmat); std::ostringstream out; if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { out << "BILU0 reordering strategy: " << "level_scheduling\n"; @@ -93,15 +102,11 @@ namespace bda findGraphColoring(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor); } else if (opencl_ilu_reorder == ILUReorder::NONE) { out << "BILU0 reordering strategy: none\n"; - numColors = 1; - rowsPerColor.emplace_back(Nb); - // numColors = Nb; - // for(int i = 0; i < Nb; ++i){ - // rowsPerColor.emplace_back(1); - // } + // numColors = 1; + // rowsPerColor.emplace_back(Nb); + numColors = Nb; for(int i = 0; i < Nb; ++i){ - toOrder[i] = i; - fromOrder[i] = i; + rowsPerColor.emplace_back(1); } } else { OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n"); @@ -114,8 +119,10 @@ namespace bda } OpmLog::info(out.str()); - delete[] CSCRowIndices; - delete[] CSCColPointers; + if (opencl_ilu_reorder != ILUReorder::NONE) { + delete[] CSCRowIndices; + delete[] CSCColPointers; + } diagIndex = new int[mat->Nb]; invDiagVals = new double[mat->Nb * bs * bs]; @@ -385,18 +392,24 @@ namespace bda bool BILU0::create_preconditioner(BlockedMatrix *mat) { const unsigned int bs = block_size; - Timer t_reorder; - reorderBlockedMatrixByPattern(mat, toOrder, fromOrder, rmat.get()); + auto *m = mat; - if (verbosity >= 3){ - std::ostringstream out; - out << "BILU0 reorder matrix: " << t_reorder.stop() << " s"; - OpmLog::info(out.str()); + if (opencl_ilu_reorder != ILUReorder::NONE) { + m = rmat.get(); + Timer t_reorder; + reorderBlockedMatrixByPattern(mat, toOrder, fromOrder, rmat.get()); + + if (verbosity >= 3){ + std::ostringstream out; + out << "BILU0 reorder matrix: " << t_reorder.stop() << " s"; + OpmLog::info(out.str()); + } } // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp + // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy Timer t_copy; - memcpy(LUmat->nnzValues, rmat->nnzValues, sizeof(double) * bs * bs * rmat->nnzbs); + memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs); if (verbosity >= 3){ std::ostringstream out; diff --git a/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp b/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp index 6b9e8fd67..3862b3840 100644 --- a/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp +++ b/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp @@ -126,6 +126,7 @@ public: /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect /// Those indices need to be mapped via toOrder /// \param[in] toOrder array with mappings + /// \param[in] reorder whether reordering is actually used or not void setReordering(int *toOrder, bool reorder); }; diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index d90c9ac6f..3c7aad43a 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -44,7 +44,7 @@ using Opm::OpmLog; using Dune::Timer; template -openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder) : BdaSolver(verbosity_, maxit_, tolerance_, platformID_, deviceID_) { +openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) { prec = new Preconditioner(opencl_ilu_reorder, verbosity_); cl_int err = CL_SUCCESS; @@ -485,7 +485,6 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub prec->setOpenCLContext(context.get()); prec->setOpenCLQueue(queue.get()); - rb = new double[N]; tmp = new double[N]; #if COPY_ROW_BY_ROW vals_contiguous = new double[N]; @@ -508,7 +507,12 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb); d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1)); - d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); + bool reorder = (opencl_ilu_reorder != ILUReorder::NONE); + if (reorder) { + rb = new double[N]; + d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); + } + wcontainer->init(wellContribs, N, Nb, reorder); // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking @@ -544,7 +548,9 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub template void openclSolverBackend::finalize() { - delete[] rb; + if (opencl_ilu_reorder != ILUReorder::NONE) { + delete[] rb; + } delete[] tmp; #if COPY_ROW_BY_ROW delete[] vals_contiguous; @@ -572,7 +578,9 @@ void openclSolverBackend::copy_system_to_gpu() { queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices); queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers); queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb); - queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder); + if (opencl_ilu_reorder != ILUReorder::NONE) { + queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder); + } queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event); event.wait(); @@ -624,9 +632,13 @@ bool openclSolverBackend::analyse_matrix() { int lmem_per_work_group = work_group_size * sizeof(double); prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group); - toOrder = prec->getToOrder(); - fromOrder = prec->getFromOrder(); - rmat = prec->getRMat(); + if (opencl_ilu_reorder == ILUReorder::NONE) { + rmat = mat.get(); + } else { + toOrder = prec->getToOrder(); + fromOrder = prec->getFromOrder(); + rmat = prec->getRMat(); + } if (verbosity > 2) { std::ostringstream out; @@ -645,7 +657,11 @@ void openclSolverBackend::update_system(double *vals, double *b) { Timer t; mat->nnzValues = vals; - reorderBlockedVectorByPattern(mat->Nb, b, fromOrder, rb); + if (opencl_ilu_reorder != ILUReorder::NONE) { + reorderBlockedVectorByPattern(mat->Nb, b, fromOrder, rb); + } else { + rb = b; + } if (verbosity > 2) { std::ostringstream out; @@ -703,8 +719,12 @@ template void openclSolverBackend::get_result(double *x) { Timer t; - queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb); - reorderBlockedVectorByPattern(mat->Nb, rb, toOrder, x); + if (opencl_ilu_reorder != ILUReorder::NONE) { + queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb); + reorderBlockedVectorByPattern(mat->Nb, rb, toOrder, x); + } else { + queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x); + } if (verbosity > 2) { std::ostringstream out; diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp index 1c36fefcb..1a8dfa73b 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp @@ -51,7 +51,7 @@ class openclSolverBackend : public BdaSolver using Base::initialized; private: - double *rb = nullptr; // reordered b vector, the matrix is reordered, so b must also be + double *rb = nullptr; // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b double *vals_contiguous = nullptr; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp // OpenCL variables must be reusable, they are initialized in initialize() @@ -59,7 +59,7 @@ private: cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p; // vectors, used during linear solve cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm() - cl::Buffer d_toOrder; + cl::Buffer d_toOrder; // only used when reordering is used double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm() // shared pointers are also passed to other objects @@ -81,7 +81,8 @@ private: int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings bool analysis_done = false; std::unique_ptr > mat = nullptr; // original matrix - BlockedMatrix *rmat = nullptr; // reordered matrix, used for spmv + BlockedMatrix *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv + ILUReorder opencl_ilu_reorder; // reordering strategy /// Divide A by B, and round up: return (int)ceil(A/B) /// \param[in] A dividend From 0f9ae3695d90f44476582eeab09fb2a7d8f1779c Mon Sep 17 00:00:00 2001 From: tqiu Date: Wed, 13 Jan 2021 11:54:40 +0100 Subject: [PATCH 04/14] Made reordering optional for WellContributions Removed unnecessary alloc --- opm/simulators/linalg/bda/BILU0.cpp | 3 - .../linalg/bda/WellContributions.cpp | 45 ++++++------ .../linalg/bda/WellContributions.hpp | 16 +++- opm/simulators/linalg/bda/openclKernels.hpp | 73 +++++++++++++++++++ .../linalg/bda/openclSolverBackend.cpp | 17 +++-- .../linalg/bda/openclSolverBackend.hpp | 7 +- 6 files changed, 124 insertions(+), 37 deletions(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 165106b50..e5f95ac87 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -57,7 +57,6 @@ namespace bda delete[] toOrder; delete[] fromOrder; } - delete[] LUmat->nnzValues; } template @@ -130,8 +129,6 @@ namespace bda Lmat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); Umat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); - LUmat->nnzValues = new double[mat->nnzbs * bs * bs]; - s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs); s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index 3eca058af..f17c53c3b 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -56,11 +56,6 @@ WellContributions::~WellContributions() if(num_std_wells > 0){ delete[] val_pointers; -#if HAVE_OPENCL - if(opencl_gpu){ - delete[] h_toOrder; - } -#endif } #if HAVE_OPENCL @@ -79,8 +74,15 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu this->queue = queue_; } -void WellContributions::setKernel(kernel_type *kernel_){ +void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){ this->kernel = kernel_; + this->kernel_no_reorder = kernel_no_reorder_; +} + +void WellContributions::setReordering(int *h_toOrder_, bool reorder_) +{ + this->h_toOrder = h_toOrder_; + this->reorder = reorder_; } void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){ @@ -90,29 +92,24 @@ void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffe const unsigned int lmem2 = sizeof(double) * dim_wells; cl::Event event; - event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, - cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); + if (reorder) { + event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, + cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); + } else { + event = (*kernel_no_reorder)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, + cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); + } + event.wait(); } -void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){ +void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){ if(h_x == nullptr){ h_x = new double[N]; h_y = new double[N]; } - if(h_toOrder == nullptr){ - h_toOrder = new int[Nb]; - } - - if(!read_toOrder){ - events.resize(1); - queue->enqueueReadBuffer(d_toOrder, CL_FALSE, 0, sizeof(int) * Nb, h_toOrder, nullptr, &events[0]); - events[0].wait(); - events.clear(); - read_toOrder = true; - } - events.resize(2); queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]); queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]); @@ -121,7 +118,7 @@ void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer // actually apply MultisegmentWells for(Opm::MultisegmentWellContribution *well: multisegments){ - well->setReordering(h_toOrder, true); + well->setReordering(h_toOrder, reorder); well->apply(h_x, h_y); } @@ -138,7 +135,7 @@ void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrd } if(num_ms_wells > 0){ - apply_mswells(d_x, d_y, d_toOrder); + apply_mswells(d_x, d_y); } } #endif diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp index 7267fe1f0..f3f8e3fd2 100644 --- a/opm/simulators/linalg/bda/WellContributions.hpp +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -95,17 +95,21 @@ private: typedef cl::make_kernel kernel_type; + typedef cl::make_kernel kernel_type_no_reorder; cl::Context *context; cl::CommandQueue *queue; kernel_type *kernel; + kernel_type_no_reorder *kernel_no_reorder; std::vector events; std::unique_ptr d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl; std::unique_ptr d_Ccols_ocl, d_Bcols_ocl; std::unique_ptr d_val_pointers_ocl; - bool read_toOrder = false; + bool reorder = false; int *h_toOrder = nullptr; #endif @@ -150,10 +154,16 @@ public: #endif #if HAVE_OPENCL - void setKernel(kernel_type *kernel_); + void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_); void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_); + + /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect + /// Those indices need to be mapped via toOrder + /// \param[in] toOrder array with mappings + /// \param[in] reorder whether reordering is actually used or not + void setReordering(int *toOrder, bool reorder); void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder); - void apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder); + void apply_mswells(cl::Buffer d_x, cl::Buffer d_y); void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder); #endif diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index 021905e18..e3ffadf60 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -440,6 +440,79 @@ namespace bda } )"; + inline const char* stdwell_apply_no_reorder_s = R"( + __kernel void stdwell_apply_no_reorder(__global const double *Cnnzs, + __global const double *Dnnzs, + __global const double *Bnnzs, + __global const int *Ccols, + __global const int *Bcols, + __global const double *x, + __global double *y, + const unsigned int dim, + const unsigned int dim_wells, + __global const unsigned int *val_pointers, + __local double *localSum, + __local double *z1, + __local double *z2){ + int wgId = get_group_id(0); + int wiId = get_local_id(0); + int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; + int valsPerBlock = dim*dim_wells; + int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; + int numBlocksPerWarp = get_local_size(0)/valsPerBlock; + int c = wiId % dim; + int r = (wiId/dim) % dim_wells; + double temp; + + barrier(CLK_LOCAL_MEM_FENCE); + + localSum[wiId] = 0; + if(wiId < numActiveWorkItems){ + int b = wiId/valsPerBlock + val_pointers[wgId]; + while(b < valSize + val_pointers[wgId]){ + int colIdx = Bcols[b]; + localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; + b += numBlocksPerWarp; + } + + if(wiId < valsPerBlock){ + localSum[wiId] += localSum[wiId + valsPerBlock]; + } + + b = wiId/valsPerBlock + val_pointers[wgId]; + + if(c == 0 && wiId < valsPerBlock){ + for(unsigned int stride = 2; stride > 0; stride >>= 1){ + localSum[wiId] += localSum[wiId + stride]; + } + z1[r] = localSum[wiId]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim_wells){ + temp = 0.0; + for(unsigned int i = 0; i < dim_wells; ++i){ + temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; + } + z2[wiId] = temp; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim*valSize){ + temp = 0.0; + int bb = wiId/dim + val_pointers[wgId]; + for (unsigned int j = 0; j < dim_wells; ++j){ + temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; + } + int colIdx = Ccols[bb]; + y[colIdx*dim + c] -= temp; + } + } + )"; + } // end namespace bda #endif diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index 3c7aad43a..5e10ad8ec 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -318,7 +317,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr double norm, norm_0; if(wellContribs.getNumWells() > 0){ - wellContribs.setKernel(stdwell_apply_k.get()); + wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get()); } Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false); @@ -479,6 +478,7 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s))); source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s))); source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s))); + source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s))); program = cl::Program(*context, source); program.build(devices); @@ -512,7 +512,6 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub rb = new double[N]; d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); } - wcontainer->init(wellContribs, N, Nb, reorder); // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking @@ -529,6 +528,10 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply"))); + stdwell_apply_no_reorder_k.reset(new cl::make_kernel(cl::Kernel(program, "stdwell_apply_no_reorder"))); prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get()); @@ -653,14 +656,16 @@ bool openclSolverBackend::analyse_matrix() { template -void openclSolverBackend::update_system(double *vals, double *b) { +void openclSolverBackend::update_system(double *vals, double *b, WellContributions &wellContribs) { Timer t; mat->nnzValues = vals; if (opencl_ilu_reorder != ILUReorder::NONE) { reorderBlockedVectorByPattern(mat->Nb, b, fromOrder, rb); + wellContribs.setReordering(toOrder, true); } else { rb = b; + wellContribs.setReordering(nullptr, false); } if (verbosity > 2) { @@ -743,13 +748,13 @@ SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; } } - update_system(vals, b); + update_system(vals, b, wellContribs); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } copy_system_to_gpu(); } else { - update_system(vals, b); + update_system(vals, b, wellContribs); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp index 1a8dfa73b..aae527643 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp @@ -76,6 +76,10 @@ private: cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k; + std::shared_ptr > stdwell_apply_no_reorder_k; Preconditioner *prec = nullptr; // only supported preconditioner is BILU0 int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings @@ -152,7 +156,8 @@ private: /// Reorder the linear system so it corresponds with the coloring /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values /// \param[in] b input vectors, contains N values - void update_system(double *vals, double *b); + /// \param[out] wellContribs WellContributions, to set reordering + void update_system(double *vals, double *b, WellContributions &wellContribs); /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same void update_system_on_gpu(); From a8e524fc9d97ddd6bed18551e08e1ce2418bce46 Mon Sep 17 00:00:00 2001 From: tqiu Date: Mon, 18 Jan 2021 14:11:07 +0100 Subject: [PATCH 05/14] Minor C to C++ changes --- opm/simulators/linalg/bda/BILU0.cpp | 33 +++++++++++++---------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index e5f95ac87..40449cb52 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -167,7 +167,6 @@ namespace bda // split matrix into L and U // also convert U into BSC format (Ut) // Ut stores diagonal for now - int num_blocks_L = 0; // Ut is actually BSC format std::unique_ptr > Ut = std::make_unique >(Umat->Nb, Umat->nnzbs + Umat->Nb); @@ -177,6 +176,7 @@ namespace bda Ut->rowPointers[i] = 0; } + int num_blocks_L = 0; // for every row for (int i = 0; i < Nb; i++) { int iRowStart = LUmat->rowPointers[i]; @@ -188,19 +188,16 @@ namespace bda Ut->rowPointers[j+1]++; // actually colPointers } else { Lmat->colIndices[num_blocks_L] = j; - memcpy(Lmat->nnzValues + num_blocks_L * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs); + std::copy(LUmat->nnzValues + ij * bs * bs, LUmat->nnzValues + (ij+1) * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs); num_blocks_L++; } } + // TODO: copy all blocks for L at once, instead of copying each block individually Lmat->rowPointers[i+1] = num_blocks_L; } // prefix sum - int sum = 0; - for (int i = 1; i < Nb+1; i++) { - sum += Ut->rowPointers[i]; - Ut->rowPointers[i] = sum; - } + std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers); // for every row for (int i = 0; i < Nb; i++) { @@ -317,13 +314,17 @@ namespace bda } } - double *t = Lmat->nnzValues; - Lmat->nnzValues = Ltmp; - Ltmp = t; - t = Ut->nnzValues; - Ut->nnzValues = Utmp; - Utmp = t; + // 1st sweep writes to Ltmp + // 2nd sweep writes to Lmat->nnzValues + std::swap(Lmat->nnzValues, Ltmp); + std::swap(Ut->nnzValues, Utmp); } // end sweep + + // if number of sweeps is odd, swap again so data is in Lmat->nnzValues + if (num_sweeps % 2 == 1) { + std::swap(Lmat->nnzValues, Ltmp); + std::swap(Ut->nnzValues, Utmp); + } delete[] Ltmp; #endif @@ -346,11 +347,7 @@ namespace bda } // prefix sum - int sumU = 0; - for (int i = 1; i < Nb+1; i++) { - sumU += ptr[i]; - ptr[i] = sumU; - } + std::partial_sum(ptr.begin(), ptr.end(), ptr.begin()); // actually copy nonzero values for U for(int i = 0; i < Nb; ++i) { From 123e3fa89e653ccce5b5382d40a947b230badfbd Mon Sep 17 00:00:00 2001 From: tqiu Date: Mon, 18 Jan 2021 17:10:46 +0100 Subject: [PATCH 06/14] Use std::find and added comments --- opm/simulators/linalg/bda/BILU0.cpp | 94 +++++++++++++++++------------ 1 file changed, 57 insertions(+), 37 deletions(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 40449cb52..b79964d65 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -238,80 +238,100 @@ namespace bda double *Ltmp = new double[Lmat->nnzbs * block_size * block_size]; for (int sweep = 0; sweep < num_sweeps; ++sweep) { + // algorithm + // for every block in A (LUmat): + // if i > j: + // Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj + // else: + // Uij = (Aij - sum k=1 to i-1 {Lik*Ukj}) + // for every row for (int row = 0; row < Nb; row++) { // update U + // Uij = (Aij - sum k=1 to i-1 {Lik*Ukj}) int jColStart = Ut->rowPointers[row]; int jColEnd = Ut->rowPointers[row + 1]; + int colU = row; // rename for clarity, next row in Ut means next col in U // for every block in this row for (int ij = jColStart; ij < jColEnd; ij++) { - int col = Ut->colIndices[ij]; + int rowU1 = Ut->colIndices[ij]; // actually rowIndices for U // refine Uij element (or diagonal) - int i1 = LUmat->rowPointers[col]; - int i2 = LUmat->rowPointers[col+1]; - int kk = 0; - for(kk = i1; kk < i2; ++kk) { - ptrdiff_t c = LUmat->colIndices[kk]; - if (c >= row) { - break; - } - } + int i1 = LUmat->rowPointers[rowU1]; + int i2 = LUmat->rowPointers[rowU1+1]; + + // search on row rowU1, find blockIndex in LUmat of block with same col (colU) as Uij + // LUmat->nnzValues[kk] is block Aij + auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, colU); + assert(candidate != LUmat->colIndices + i2); + auto kk = candidate - LUmat->colIndices; + double aij[bs*bs]; + // copy block to Aij so operations can be done on it without affecting LUmat memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs); - int jk = Lmat->rowPointers[col]; - int ik = (jk < Lmat->rowPointers[col+1]) ? Lmat->colIndices[jk] : Nb; - for (int k = jColStart; k < ij; ++k) { - int ki = Ut->colIndices[k]; - while (ik < ki) { - ++jk; - ik = Lmat->colIndices[jk]; - } - if (ik == ki) { - blockMultSub(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs); + int jk = Lmat->rowPointers[rowU1]; // points to row rowU1 in L + // if row rowU1 is empty, skip row + if (jk < Lmat->rowPointers[rowU1+1]) { + int colL = Lmat->colIndices[jk]; + for (int k = jColStart; k < ij; ++k) { + int rowU2 = Ut->colIndices[k]; + while (colL < rowU2) { + ++jk; // check next block on row rowU1 of L + colL = Lmat->colIndices[jk]; + } + if (colL == rowU2) { + // Aij -= (Lik * Ukj) + blockMultSub(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs); + } } } + // Uij_new = Aij - sum memcpy(Utmp + ij * bs * bs, &aij[0], sizeof(double) * bs * bs); } // update L + // Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj int iRowStart = Lmat->rowPointers[row]; int iRowEnd = Lmat->rowPointers[row + 1]; for (int ij = iRowStart; ij < iRowEnd; ij++) { int j = Lmat->colIndices[ij]; // refine Lij element + // search on row 'row', find blockIndex in LUmat of block with same col (j) as Lij + // LUmat->nnzValues[kk] is block Aij int i1 = LUmat->rowPointers[row]; int i2 = LUmat->rowPointers[row+1]; - int kk = 0; - for(kk = i1; kk < i2; ++kk) { - ptrdiff_t c = LUmat->colIndices[kk]; - if (c >= j) { - break; - } - } + + auto candidate = std::find(LUmat->colIndices + i1, LUmat->colIndices + i2, j); + assert(candidate != LUmat->colIndices + i2); + auto kk = candidate - LUmat->colIndices; + double aij[bs*bs]; + // copy block to Aij so operations can be done on it without affecting LUmat memcpy(&aij[0], LUmat->nnzValues + kk * bs * bs, sizeof(double) * bs * bs); - int jk = Ut->rowPointers[j]; - int ik = Ut->colIndices[jk]; + + int jk = Ut->rowPointers[j]; // actually colPointers, jk points to col j in U + int rowU = Ut->colIndices[jk]; // actually rowIndices, rowU is the row of block jk + // check if L has a matching block where colL == rowU for (int k = iRowStart; k < ij; ++k) { - int ki = Lmat->colIndices[k]; - while(ik < ki) { - ++jk; - ik = Ut->colIndices[jk]; + int colL = Lmat->colIndices[k]; + while (rowU < colL) { + ++jk; // check next block on col j of U + rowU = Ut->colIndices[jk]; } - if(ik == ki) { + if (rowU == colL) { + // Aij -= (Lik * Ukj) blockMultSub(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs); } } - // calculate aij / ujj + + // calculate (Aij - sum) / Ujj double ujj[bs*bs]; inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]); - // lij = aij / ujj + // Lij_new = (Aij - sum) / Ujj blockMult(&aij[0], &ujj[0], Ltmp + ij * bs * bs); - } } // 1st sweep writes to Ltmp From 48ebef7808fbf60326a13cd03a1ede7dde16b81e Mon Sep 17 00:00:00 2001 From: tqiu Date: Tue, 19 Jan 2021 11:06:39 +0100 Subject: [PATCH 07/14] Updated comments --- opm/simulators/linalg/bda/BILU0.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index b79964d65..b1b11ce01 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -185,7 +185,7 @@ namespace bda for (int ij = iRowStart; ij < iRowEnd; ij++) { int j = LUmat->colIndices[ij]; if (i <= j) { - Ut->rowPointers[j+1]++; // actually colPointers + Ut->rowPointers[j+1]++; // actually colPointers, now simply indicates how many blocks this col holds } else { Lmat->colIndices[num_blocks_L] = j; std::copy(LUmat->nnzValues + ij * bs * bs, LUmat->nnzValues + (ij+1) * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs); @@ -196,7 +196,7 @@ namespace bda Lmat->rowPointers[i+1] = num_blocks_L; } - // prefix sum + // prefix sum to sum rowsizes into colpointers std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers); // for every row @@ -207,7 +207,7 @@ namespace bda for (int ij = iRowStart; ij < iRowEnd; ij++) { int j = LUmat->colIndices[ij]; if (i <= j){ - int idx = Ut->rowPointers[j]++; + int idx = Ut->rowPointers[j]++; // rowPointers[i] is (mis)used as the write offset of the current row i Ut->colIndices[idx] = i; // actually rowIndices memcpy(Ut->nnzValues + idx * bs * bs, LUmat->nnzValues + ij * bs * bs, sizeof(double) * bs * bs); } @@ -216,6 +216,7 @@ namespace bda // rotate // the Ut->rowPointers were increased in the last loop + // now Ut->rowPointers[i+1] is at the same position as Ut->rowPointers[i] should have for a crs matrix. reset to correct expected value for (int i = Nb; i > 0; --i) { Ut->rowPointers[i] = Ut->rowPointers[i-1]; } @@ -223,7 +224,8 @@ namespace bda Opm::Detail::Inverter inverter; - // Utmp is needed for CPU and GPU decomposition, because U is transposed, and reversed at the end + // Utmp is needed for CPU and GPU decomposition, because U is transposed, and reversed after decomposition + // U will be reversed because it is used with backwards substitution, the last row is used first // Ltmp is only needed for CPU decomposition, GPU creates GPU buffer for Ltmp double *Utmp = new double[Ut->nnzbs * block_size * block_size]; @@ -273,6 +275,7 @@ namespace bda // if row rowU1 is empty, skip row if (jk < Lmat->rowPointers[rowU1+1]) { int colL = Lmat->colIndices[jk]; + // for every block on colU for (int k = jColStart; k < ij; ++k) { int rowU2 = Ut->colIndices[k]; while (colL < rowU2) { @@ -385,7 +388,7 @@ namespace bda std::rotate(ptr.begin(), ptr.end() - 1, ptr.end()); ptr.front() = 0; - // reversing the rows of U, because that is the order they are used in + // reversing the rows of U, because that is the order they are used in during ILU apply int URowIndex = 0; int offsetU = 0; // number of nnz blocks that are already copied to Umat Umat->rowPointers[0] = 0; From dba20adf0424d5f7dbace67086b6454025a61a19 Mon Sep 17 00:00:00 2001 From: tqiu Date: Wed, 20 Jan 2021 11:00:30 +0100 Subject: [PATCH 08/14] Bugfix: extra swap was done on odd number of sweeps --- opm/simulators/linalg/bda/BILU0.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index b1b11ce01..051f61fbf 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -343,8 +343,8 @@ namespace bda std::swap(Ut->nnzValues, Utmp); } // end sweep - // if number of sweeps is odd, swap again so data is in Lmat->nnzValues - if (num_sweeps % 2 == 1) { + // if number of sweeps is even, swap again so data is in Lmat->nnzValues + if (num_sweeps % 2 == 0) { std::swap(Lmat->nnzValues, Ltmp); std::swap(Ut->nnzValues, Utmp); } From 1e09b1f4d90b3cff3e576e6b40d57df5ad162a1b Mon Sep 17 00:00:00 2001 From: tqiu Date: Tue, 26 Jan 2021 13:45:40 +0100 Subject: [PATCH 09/14] Added comments and rewrote if --- opm/simulators/linalg/bda/BILU0.cpp | 4 +- opm/simulators/linalg/bda/fgpilu.cpp | 173 +++++++++++++++++---------- 2 files changed, 113 insertions(+), 64 deletions(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 051f61fbf..bdfdbe8b2 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -275,7 +275,7 @@ namespace bda // if row rowU1 is empty, skip row if (jk < Lmat->rowPointers[rowU1+1]) { int colL = Lmat->colIndices[jk]; - // for every block on colU + // only check until block U(i,j) is reached for (int k = jColStart; k < ij; ++k) { int rowU2 = Ut->colIndices[k]; while (colL < rowU2) { @@ -316,7 +316,7 @@ namespace bda int jk = Ut->rowPointers[j]; // actually colPointers, jk points to col j in U int rowU = Ut->colIndices[jk]; // actually rowIndices, rowU is the row of block jk - // check if L has a matching block where colL == rowU + // only check until block L(i,j) is reached for (int k = iRowStart; k < ij; ++k) { int colL = Lmat->colIndices[k]; while (rowU < colL) { diff --git a/opm/simulators/linalg/bda/fgpilu.cpp b/opm/simulators/linalg/bda/fgpilu.cpp index 508fcafe0..70226f44f 100644 --- a/opm/simulators/linalg/bda/fgpilu.cpp +++ b/opm/simulators/linalg/bda/fgpilu.cpp @@ -45,6 +45,7 @@ namespace bda // PARALLEL 1 should be run with at least 32 workitems per workgroup. // The recommended number of workgroups for both options is Nb, which gives every row their own workgroup. // PARALLEL 0 is generally faster, despite not having parallelization. +// only 3x3 blocks are supported #define PARALLEL 0 @@ -54,6 +55,8 @@ inline const char* fgpilu_sweep_s = R"( #pragma OPENCL EXTENSION cl_khr_fp64 : enable +// subtract blocks: a = a - b * c +// the output block has 9 entries, each entry is calculated by 1 thread void blockMultSub( __local double * restrict a, __global const double * restrict b, @@ -74,7 +77,8 @@ void blockMultSub( } } - +// multiply blocks: resMat = mat1 * mat2 +// the output block has 9 entries, each entry is calculated by 1 thread void blockMult( __local const double * restrict mat1, __local const double * restrict mat2, @@ -95,13 +99,14 @@ void blockMult( } } - +// invert block: inverse = matrix^{-1} +// the output block has 9 entries, each entry is calculated by 1 thread void invert( __global const double * restrict matrix, __local double * restrict inverse) { const unsigned int block_size = 3; - const unsigned int bs = block_size; + const unsigned int bs = block_size; // rename to shorter name const unsigned int warp_size = 32; const unsigned int idx_t = get_local_id(0); // thread id in work group const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads) @@ -128,6 +133,10 @@ void invert( } } +// perform the fixed-point iteration +// all entries in L and U are updated once +// output is written to [LU]tmp +// aij and ujj are local arrays whose size is specified before kernel launch __kernel void fgpilu_sweep( __global const double * restrict Ut_vals, __global const double * restrict L_vals, @@ -145,26 +154,30 @@ __kernel void fgpilu_sweep( __local double *ujj) { const int bs = 3; - - // for every row const unsigned int warp_size = 32; - const unsigned int bsize = get_local_size(0); - const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int work_group_size = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / work_group_size; const unsigned int num_groups = get_num_groups(0); - const unsigned int warps_per_group = bsize / warp_size; + const unsigned int warps_per_group = work_group_size / warp_size; const unsigned int idx_t = get_local_id(0); // thread id in work group const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads) const unsigned int warp_id_in_group = idx_t / warp_size; - const unsigned int lmem_offset = warp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it + const unsigned int lmem_offset = warp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it + // every workitem in a warp has the same lmem_offset + + // for every row of L or every col of U for (int row = idx_b; row < Nb; row+=num_groups) { - int jColStart = Ut_rows[row]; + // Uij = (Aij - sum k=1 to i-1 {Lik*Ukj}) + int jColStart = Ut_rows[row]; // actually colPointers to U int jColEnd = Ut_rows[row + 1]; + // for every block on this column for (int ij = jColStart + warp_id_in_group; ij < jColEnd; ij+=warps_per_group) { - int col = Ut_cols[ij]; + int rowU1 = Ut_cols[ij]; // actually rowIndices for U // refine Uij element (or diagonal) - int i1 = LU_rows[col]; - int i2 = LU_rows[col+1]; + int i1 = LU_rows[rowU1]; + int i2 = LU_rows[rowU1+1]; int kk = 0; + // LUmat->nnzValues[kk] is block Aij for(kk = i1; kk < i2; ++kk) { int c = LU_cols[kk]; if (c >= row) { @@ -172,40 +185,48 @@ __kernel void fgpilu_sweep( } } + // copy block Aij so operations can be done on it without affecting LUmat if(thread_id_in_warp < bs*bs){ aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp]; } - int jk = L_rows[col]; - int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb; - - for (int k = jColStart; k < ij; ++k) { - int ki = Ut_cols[k]; - while (ik < ki) { - ++jk; - ik = L_cols[jk]; - } - if (ik == ki) { - blockMultSub(aij+lmem_offset, L_vals + jk * bs * bs, Ut_vals + k * bs * bs); + int jk = L_rows[rowU1]; // points to row rowU1 in L + // if row rowU1 is empty: skip row. The whole warp looks at the same row, so no divergence + if (jk < L_rows[rowU1+1]) { + int colL = L_cols[jk]; + // only check until block U(i,j) is reached + for (int k = jColStart; k < ij; ++k) { + int rowU2 = Ut_cols[k]; // actually rowIndices for U + while (colL < rowU2) { + ++jk; // check next block on row rowU1 of L + colL = L_cols[jk]; + } + if (colL == rowU2) { + // Aij -= (Lik * Ukj) + blockMultSub(aij+lmem_offset, L_vals + jk * bs * bs, Ut_vals + k * bs * bs); + } } } + // Uij_new = Aij - sum + // write result of this sweep if(thread_id_in_warp < bs*bs){ Utmp[ij*bs*bs + thread_id_in_warp] = aij[lmem_offset + thread_id_in_warp]; } } // update L + // Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj int iRowStart = L_rows[row]; int iRowEnd = L_rows[row + 1]; for (int ij = iRowStart + warp_id_in_group; ij < iRowEnd; ij+=warps_per_group) { - // for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) { int j = L_cols[ij]; // // refine Lij element int i1 = LU_rows[row]; int i2 = LU_rows[row+1]; int kk = 0; + // LUmat->nnzValues[kk] is block Aij for(kk = i1; kk < i2; ++kk) { int c = LU_cols[kk]; if (c >= j) { @@ -213,28 +234,32 @@ __kernel void fgpilu_sweep( } } + // copy block Aij so operations can be done on it without affecting LUmat if(thread_id_in_warp < bs*bs){ aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp]; } - int jk = Ut_rows[j]; - int ik = Ut_cols[jk]; + int jk = Ut_rows[j]; // actually colPointers, jk points to col j in U + int rowU = Ut_cols[jk]; // actually rowIndices, rowU is the row of block jk + // only check until block L(i,j) is reached for (int k = iRowStart; k < ij; ++k) { - int ki = L_cols[k]; - while(ik < ki) { - ++jk; - ik = Ut_cols[jk]; + int colL = L_cols[k]; + while(rowU < colL) { + ++jk; // check next block on col j of U + rowU = Ut_cols[jk]; } - if(ik == ki) { + if(rowU == colL) { + // Aij -= (Lik * Ukj) blockMultSub(aij+lmem_offset, L_vals + k * bs * bs , Ut_vals + jk * bs * bs); } } - // calculate aij / ujj + // calculate 1 / Ujj invert(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+lmem_offset); - // lij = aij / ujj + // Lij_new = (Aij - sum) / Ujj + // write result of this sweep blockMult(aij+lmem_offset, ujj+lmem_offset, Ltmp + ij * bs * bs); } } @@ -247,6 +272,8 @@ inline const char* fgpilu_sweep_s = R"( #pragma OPENCL EXTENSION cl_khr_fp64 : enable +// subtract blocks: a = a - b * c +// only one workitem performs this action void blockMultSub( __local double * restrict a, __global const double * restrict b, @@ -264,7 +291,8 @@ void blockMultSub( } } - +// multiply blocks: resMat = mat1 * mat2 +// only one workitem performs this action void blockMult( __local const double * restrict mat1, __local const double * restrict mat2, @@ -282,7 +310,8 @@ void blockMult( } } - +// invert block: inverse = matrix^{-1} +// only one workitem performs this action __kernel void inverter( __global const double * restrict matrix, __local double * restrict inverse) @@ -310,6 +339,10 @@ __kernel void inverter( inverse[8] = (t4 - t8) * t17; } +// perform the fixed-point iteration +// all entries in L and U are updated once +// output is written to [LU]tmp +// aij and ujj are local arrays whose size is specified before kernel launch __kernel void fgpilu_sweep( __global const double * restrict Ut_vals, __global const double * restrict L_vals, @@ -328,24 +361,28 @@ __kernel void fgpilu_sweep( { const int bs = 3; - // for every row const unsigned int warp_size = 32; - const unsigned int bsize = get_local_size(0); - const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int work_group_size = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / work_group_size; const unsigned int num_groups = get_num_groups(0); - const unsigned int warps_per_group = bsize / warp_size; + const unsigned int warps_per_group = work_group_size / warp_size; const unsigned int idx_t = get_local_id(0); // thread id in work group const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads) const unsigned int warp_id_in_group = idx_t / warp_size; + + // for every row of L or every col of U for (int row = idx_b; row < Nb; row+=num_groups) { - int jColStart = Ut_rows[row]; + // Uij = (Aij - sum k=1 to i-1 {Lik*Ukj}) + int jColStart = Ut_rows[row]; // actually colPointers to U int jColEnd = Ut_rows[row + 1]; - for (int ij = jColStart + idx_t; ij < jColEnd; ij+=bsize) { - int col = Ut_cols[ij]; + // for every block on this column + for (int ij = jColStart + idx_t; ij < jColEnd; ij+=work_group_size) { + int rowU1 = Ut_cols[ij]; // actually rowIndices for U // refine Uij element (or diagonal) - int i1 = LU_rows[col]; - int i2 = LU_rows[col+1]; + int i1 = LU_rows[rowU1]; + int i2 = LU_rows[rowU1+1]; int kk = 0; + // LUmat->nnzValues[kk] is block Aij for(kk = i1; kk < i2; ++kk) { int c = LU_cols[kk]; if (c >= row) { @@ -353,39 +390,47 @@ __kernel void fgpilu_sweep( } } + // copy block Aij so operations can be done on it without affecting LUmat for(int z = 0; z < bs*bs; ++z){ aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z]; } - int jk = L_rows[col]; - int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb; + int jk = L_rows[rowU1]; + // if row rowU1 is empty: do not sum. The workitems have different rowU1 values, divergence is possible + int colL = (jk < L_rows[rowU1+1]) ? L_cols[jk] : Nb; + // only check until block U(i,j) is reached for (int k = jColStart; k < ij; ++k) { - int ki = Ut_cols[k]; - while (ik < ki) { - ++jk; - ik = L_cols[jk]; + int rowU2 = Ut_cols[k]; // actually rowIndices for U + while (colL < rowU2) { + ++jk; // check next block on row rowU1 of L + colL = L_cols[jk]; } - if (ik == ki) { + if (colL == rowU2) { + // Aij -= (Lik * Ukj) blockMultSub(aij+idx_t*bs*bs, L_vals + jk * bs * bs, Ut_vals + k * bs * bs); } } + // Uij_new = Aij - sum + // write result of this sweep for(int z = 0; z < bs*bs; ++z){ Utmp[ij*bs*bs + z] = aij[idx_t*bs*bs+z]; } } // update L + // Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj int iRowStart = L_rows[row]; int iRowEnd = L_rows[row + 1]; - for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) { + for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=work_group_size) { int j = L_cols[ij]; // // refine Lij element int i1 = LU_rows[row]; int i2 = LU_rows[row+1]; int kk = 0; + // LUmat->nnzValues[kk] is block Aij for(kk = i1; kk < i2; ++kk) { int c = LU_cols[kk]; if (c >= j) { @@ -393,27 +438,31 @@ __kernel void fgpilu_sweep( } } + // copy block Aij so operations can be done on it without affecting LUmat for(int z = 0; z < bs*bs; ++z){ aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z]; } - int jk = Ut_rows[j]; - int ik = Ut_cols[jk]; + int jk = Ut_rows[j]; // actually colPointers, jk points to col j in U + int rowU = Ut_cols[jk]; // actually rowIndices, rowU is the row of block jk + // only check until block L(i,j) is reached for (int k = iRowStart; k < ij; ++k) { - int ki = L_cols[k]; - while(ik < ki) { - ++jk; - ik = Ut_cols[jk]; + int colL = L_cols[k]; + while(rowU < colL) { + ++jk; // check next block on col j of U + rowU = Ut_cols[jk]; } - if(ik == ki) { + if(rowU == colL) { + // Aij -= (Lik * Ukj) blockMultSub(aij+idx_t*bs*bs, L_vals + k * bs * bs , Ut_vals + jk * bs * bs); } } - // calculate aij / ujj + // calculate 1 / ujj inverter(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+idx_t*bs*bs); - // lij = aij / ujj + // Lij_new = (Aij - sum) / Ujj + // write result of this sweep blockMult(aij+idx_t*bs*bs, ujj+idx_t*bs*bs, Ltmp + ij * bs * bs); } } From 837a83fc16b601d4abc68aa859e58c9abdf04430 Mon Sep 17 00:00:00 2001 From: tqiu Date: Tue, 26 Jan 2021 17:32:18 +0100 Subject: [PATCH 10/14] Added error checking and std::call_once --- opm/simulators/linalg/bda/fgpilu.cpp | 13 +++++++++---- opm/simulators/linalg/bda/fgpilu.hpp | 6 +++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/opm/simulators/linalg/bda/fgpilu.cpp b/opm/simulators/linalg/bda/fgpilu.cpp index 70226f44f..d57ca981a 100644 --- a/opm/simulators/linalg/bda/fgpilu.cpp +++ b/opm/simulators/linalg/bda/fgpilu.cpp @@ -487,9 +487,13 @@ void FGPILU::decomposition( const int block_size = 3; try { - if (initialized == false) { + // just put everything in the capture list + std::call_once(initialize_flag, [&](){ cl::Program::Sources source(1, std::make_pair(fgpilu_sweep_s, strlen(fgpilu_sweep_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector >' cl::Program program = cl::Program(*context, source, &err); + if (err != CL_SUCCESS) { + OPM_THROW(std::logic_error, "FGPILU OpenCL could not create Program"); + } std::vector devices = context->getInfo(); program.build(devices); @@ -499,6 +503,9 @@ void FGPILU::decomposition( cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "fgpilu_sweep", &err))); + if (err != CL_SUCCESS) { + OPM_THROW(std::logic_error, "FGPILU OpenCL could not create Kernel"); + } // allocate GPU memory d_Ut_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size); @@ -530,9 +537,7 @@ void FGPILU::decomposition( out << "FGPILU PARALLEL: " << PARALLEL; OpmLog::info(out.str()); } - - initialized = true; - } + }); // copy to GPU diff --git a/opm/simulators/linalg/bda/fgpilu.hpp b/opm/simulators/linalg/bda/fgpilu.hpp index d51c21061..880db43b2 100644 --- a/opm/simulators/linalg/bda/fgpilu.hpp +++ b/opm/simulators/linalg/bda/fgpilu.hpp @@ -21,8 +21,9 @@ #define FGPILU_HEADER_INCLUDED -#include +#include +#include namespace bda @@ -43,6 +44,7 @@ namespace bda cl::Event event; cl_int err; + std::once_flag initialize_flag; std::unique_ptr > fgpilu_sweep_k; - bool initialized = false; - public: /// Executes the FGPILU sweeps /// also copies data from CPU to GPU and GPU to CPU From d222dffcfdb7d37284007d6398caedae5f5eaf7c Mon Sep 17 00:00:00 2001 From: tqiu Date: Thu, 28 Jan 2021 13:46:01 +0100 Subject: [PATCH 11/14] Improved initial guess of L --- opm/simulators/linalg/bda/BILU0.cpp | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index bdfdbe8b2..d66ea05ab 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -169,15 +169,32 @@ namespace bda // Ut stores diagonal for now // Ut is actually BSC format - std::unique_ptr > Ut = std::make_unique >(Umat->Nb, Umat->nnzbs + Umat->Nb); + std::unique_ptr > Ut = std::make_unique >(Nb, (nnzbs + Nb) / 2); Lmat->rowPointers[0] = 0; for (int i = 0; i < Nb+1; i++) { Ut->rowPointers[i] = 0; } + Opm::Detail::Inverter inverter; + + // store inverted diagonal + for (int i = 0; i < Nb; i++) { + int iRowStart = LUmat->rowPointers[i]; + int iRowEnd = LUmat->rowPointers[i + 1]; + // for every block in this row + for (int ij = iRowStart; ij < iRowEnd; ij++) { + int j = LUmat->colIndices[ij]; + if (i == j) { + inverter(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs); + } + } + } + + // initialize initial guess for L: L_A * D + // L_A is strictly lower triangular part of A + // D is inv(diag(A)) int num_blocks_L = 0; - // for every row for (int i = 0; i < Nb; i++) { int iRowStart = LUmat->rowPointers[i]; int iRowEnd = LUmat->rowPointers[i + 1]; @@ -188,7 +205,8 @@ namespace bda Ut->rowPointers[j+1]++; // actually colPointers, now simply indicates how many blocks this col holds } else { Lmat->colIndices[num_blocks_L] = j; - std::copy(LUmat->nnzValues + ij * bs * bs, LUmat->nnzValues + (ij+1) * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs); + // multiply block of L with corresponding diag block + blockMult(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs); num_blocks_L++; } } @@ -199,7 +217,7 @@ namespace bda // prefix sum to sum rowsizes into colpointers std::partial_sum(Ut->rowPointers, Ut->rowPointers+Nb+1, Ut->rowPointers); - // for every row + // initialize initial guess for U for (int i = 0; i < Nb; i++) { int iRowStart = LUmat->rowPointers[i]; int iRowEnd = LUmat->rowPointers[i + 1]; @@ -222,7 +240,6 @@ namespace bda } Ut->rowPointers[0] = 0; - Opm::Detail::Inverter inverter; // Utmp is needed for CPU and GPU decomposition, because U is transposed, and reversed after decomposition // U will be reversed because it is used with backwards substitution, the last row is used first From 363cc3131625a4a8130c549c8650e91147c8f891 Mon Sep 17 00:00:00 2001 From: tqiu Date: Thu, 28 Jan 2021 15:40:10 +0100 Subject: [PATCH 12/14] Made waiting for a GPUcopy more explicit --- opm/simulators/linalg/bda/fgpilu.cpp | 38 +++++++++++++++++----------- opm/simulators/linalg/bda/fgpilu.hpp | 1 + 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/opm/simulators/linalg/bda/fgpilu.cpp b/opm/simulators/linalg/bda/fgpilu.cpp index d57ca981a..ea009c1cd 100644 --- a/opm/simulators/linalg/bda/fgpilu.cpp +++ b/opm/simulators/linalg/bda/fgpilu.cpp @@ -521,12 +521,15 @@ void FGPILU::decomposition( d_Utmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size); Dune::Timer t_copy_pattern; - err |= queue->enqueueWriteBuffer(d_Ut_ptrs, CL_TRUE, 0, sizeof(int) * (Nb+1), Ut_ptrs); - err |= queue->enqueueWriteBuffer(d_L_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), L_rows); - err |= queue->enqueueWriteBuffer(d_LU_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), LU_rows); - err |= queue->enqueueWriteBuffer(d_Ut_idxs, CL_TRUE, 0, sizeof(int) * Ut_nnzbs, Ut_idxs); - err |= queue->enqueueWriteBuffer(d_L_cols, CL_TRUE, 0, sizeof(int) * L_nnzbs, L_cols); - err |= queue->enqueueWriteBuffer(d_LU_cols, CL_TRUE, 0, sizeof(int) * LU_nnzbs, LU_cols); + events.resize(6); + err |= queue->enqueueWriteBuffer(d_Ut_ptrs, CL_FALSE, 0, sizeof(int) * (Nb+1), Ut_ptrs, nullptr, &events[0]); + err |= queue->enqueueWriteBuffer(d_L_rows, CL_FALSE, 0, sizeof(int) * (Nb+1), L_rows, nullptr, &events[1]); + err |= queue->enqueueWriteBuffer(d_LU_rows, CL_FALSE, 0, sizeof(int) * (Nb+1), LU_rows, nullptr, &events[2]); + err |= queue->enqueueWriteBuffer(d_Ut_idxs, CL_FALSE, 0, sizeof(int) * Ut_nnzbs, Ut_idxs, nullptr, &events[3]); + err |= queue->enqueueWriteBuffer(d_L_cols, CL_FALSE, 0, sizeof(int) * L_nnzbs, L_cols, nullptr, &events[4]); + err |= queue->enqueueWriteBuffer(d_LU_cols, CL_FALSE, 0, sizeof(int) * LU_nnzbs, LU_cols, nullptr, &events[5]); + cl::WaitForEvents(events); + events.clear(); if (verbosity >= 3){ std::ostringstream out; out << "FGPILU copy sparsity pattern time: " << t_copy_pattern.stop() << " s"; @@ -542,9 +545,12 @@ void FGPILU::decomposition( // copy to GPU Dune::Timer t_copy1; - err = queue->enqueueWriteBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals); - err |= queue->enqueueWriteBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals); - err |= queue->enqueueWriteBuffer(d_LU_vals, CL_TRUE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals); + events.resize(3); + err = queue->enqueueWriteBuffer(d_Ut_vals, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]); + err |= queue->enqueueWriteBuffer(d_L_vals, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]); + err |= queue->enqueueWriteBuffer(d_LU_vals, CL_FALSE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals, nullptr, &events[2]); + cl::WaitForEvents(events); + events.clear(); if (verbosity >= 3){ std::ostringstream out; out << "FGPILU copy1 time: " << t_copy1.stop() << " s"; @@ -582,21 +588,23 @@ void FGPILU::decomposition( event.wait(); if (verbosity >= 3){ std::ostringstream out; - out << "FGPILU sweep kernel time: " << t_copy1.stop() << " s"; + out << "FGPILU sweep kernel time: " << t_kernel.stop() << " s"; OpmLog::info(out.str()); } } // copy back Dune::Timer t_copy2; + events.resize(2); if (num_sweeps % 2 == 0) { - err = queue->enqueueReadBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals); - err |= queue->enqueueReadBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals); + err = queue->enqueueReadBuffer(d_Ut_vals, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]); + err |= queue->enqueueReadBuffer(d_L_vals, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]); } else { - err = queue->enqueueReadBuffer(d_Utmp, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals); - err |= queue->enqueueReadBuffer(d_Ltmp, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals); + err = queue->enqueueReadBuffer(d_Utmp, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]); + err |= queue->enqueueReadBuffer(d_Ltmp, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]); } - err |= queue->enqueueBarrierWithWaitList(); + cl::WaitForEvents(events); + events.clear(); if (verbosity >= 3){ std::ostringstream out; out << "FGPILU copy2 time: " << t_copy2.stop() << " s"; diff --git a/opm/simulators/linalg/bda/fgpilu.hpp b/opm/simulators/linalg/bda/fgpilu.hpp index 880db43b2..b321e45b1 100644 --- a/opm/simulators/linalg/bda/fgpilu.hpp +++ b/opm/simulators/linalg/bda/fgpilu.hpp @@ -43,6 +43,7 @@ namespace bda cl::Buffer d_Ltmp, d_Utmp; cl::Event event; + std::vector events; cl_int err; std::once_flag initialize_flag; From a64a342104002640b160cf0427814bffa24c7b9a Mon Sep 17 00:00:00 2001 From: tqiu Date: Mon, 1 Feb 2021 11:19:38 +0100 Subject: [PATCH 13/14] Added symmetry check in Debug mode --- opm/simulators/linalg/bda/BILU0.cpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index d66ea05ab..fc79ffed2 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -167,6 +167,34 @@ namespace bda // split matrix into L and U // also convert U into BSC format (Ut) // Ut stores diagonal for now + // original matrix LUmat is assumed to be symmetric + +#ifndef NDEBUG + // verify that matrix is symmetric + for (int i = 0; i < Nb; ++i){ + int iRowStart = LUmat->rowPointers[i]; + int iRowEnd = LUmat->rowPointers[i + 1]; + // for every block (i, j) in this row, check if (j, i) also exists + for (int ij = iRowStart; ij < iRowEnd; ij++) { + int j = LUmat->colIndices[ij]; + int jRowStart = LUmat->rowPointers[j]; + int jRowEnd = LUmat->rowPointers[j + 1]; + bool blockFound = false; + // check all blocks on row j + // binary search is possible + for (int ji = jRowStart; ji < jRowEnd; ji++) { + int row = LUmat->colIndices[ji]; + if (i == row) { + blockFound = true; + break; + } + } + if (false == blockFound) { + OPM_THROW(std::logic_error, "Error sparsity pattern must be symmetric when using chow_patel_decomposition()"); + } + } + } +#endif // Ut is actually BSC format std::unique_ptr > Ut = std::make_unique >(Nb, (nnzbs + Nb) / 2); From c8dca99fad615163b22f3857fde0c81b52fcdd1b Mon Sep 17 00:00:00 2001 From: tqiu Date: Wed, 3 Feb 2021 17:43:54 +0100 Subject: [PATCH 14/14] Renamed fgpilu to ChowPatelIlu --- CMakeLists_files.cmake | 4 +- opm/simulators/linalg/bda/BILU0.cpp | 4 +- opm/simulators/linalg/bda/BILU0.hpp | 4 +- .../bda/{fgpilu.cpp => ChowPatelIlu.cpp} | 39 +++++++++---------- .../bda/{fgpilu.hpp => ChowPatelIlu.hpp} | 12 +++--- 5 files changed, 31 insertions(+), 32 deletions(-) rename opm/simulators/linalg/bda/{fgpilu.cpp => ChowPatelIlu.cpp} (94%) rename opm/simulators/linalg/bda/{fgpilu.hpp => ChowPatelIlu.hpp} (93%) diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 70cb66a2a..5259d65b4 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -58,7 +58,7 @@ if(OPENCL_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp) - list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/fgpilu.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) @@ -169,7 +169,7 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/bda/BlockedMatrix.hpp opm/simulators/linalg/bda/cuda_header.hpp opm/simulators/linalg/bda/cusparseSolverBackend.hpp - opm/simulators/linalg/bda/fgpilu.hpp + opm/simulators/linalg/bda/ChowPatelIlu.hpp opm/simulators/linalg/bda/Reorder.hpp opm/simulators/linalg/bda/ILUReorder.hpp opm/simulators/linalg/bda/opencl.hpp diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index fc79ffed2..991dc14b1 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -25,7 +25,7 @@ #include #include -#include +#include #include @@ -276,7 +276,7 @@ namespace bda // actual ILU decomposition #if CHOW_PATEL_GPU - fgpilu.decomposition(queue, context, + chowPatelIlu.decomposition(queue, context, Ut->rowPointers, Ut->colIndices, Ut->nnzValues, Ut->nnzbs, Lmat->rowPointers, Lmat->colIndices, Lmat->nnzValues, Lmat->nnzbs, LUmat->rowPointers, LUmat->colIndices, LUmat->nnzValues, LUmat->nnzbs, diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp index 5a31c57a9..67d945851 100644 --- a/opm/simulators/linalg/bda/BILU0.hpp +++ b/opm/simulators/linalg/bda/BILU0.hpp @@ -24,7 +24,7 @@ #include #include -#include +#include namespace bda { @@ -68,7 +68,7 @@ namespace bda int lmem_per_work_group = 0; bool pattern_uploaded = false; - FGPILU fgpilu; + ChowPatelIlu chowPatelIlu; void chow_patel_decomposition(); diff --git a/opm/simulators/linalg/bda/fgpilu.cpp b/opm/simulators/linalg/bda/ChowPatelIlu.cpp similarity index 94% rename from opm/simulators/linalg/bda/fgpilu.cpp rename to opm/simulators/linalg/bda/ChowPatelIlu.cpp index ea009c1cd..894b3f05c 100644 --- a/opm/simulators/linalg/bda/fgpilu.cpp +++ b/opm/simulators/linalg/bda/ChowPatelIlu.cpp @@ -22,7 +22,7 @@ #include #include -#include +#include namespace bda { @@ -50,8 +50,7 @@ namespace bda #define PARALLEL 0 #if PARALLEL - -inline const char* fgpilu_sweep_s = R"( +inline const char* chow_patel_ilu_sweep_s = R"( #pragma OPENCL EXTENSION cl_khr_fp64 : enable @@ -137,7 +136,7 @@ void invert( // all entries in L and U are updated once // output is written to [LU]tmp // aij and ujj are local arrays whose size is specified before kernel launch -__kernel void fgpilu_sweep( +__kernel void chow_patel_ilu_sweep( __global const double * restrict Ut_vals, __global const double * restrict L_vals, __global const double * restrict LU_vals, @@ -268,7 +267,7 @@ __kernel void fgpilu_sweep( #else -inline const char* fgpilu_sweep_s = R"( +inline const char* chow_patel_ilu_sweep_s = R"( #pragma OPENCL EXTENSION cl_khr_fp64 : enable @@ -343,7 +342,7 @@ __kernel void inverter( // all entries in L and U are updated once // output is written to [LU]tmp // aij and ujj are local arrays whose size is specified before kernel launch -__kernel void fgpilu_sweep( +__kernel void chow_patel_ilu_sweep( __global const double * restrict Ut_vals, __global const double * restrict L_vals, __global const double * restrict LU_vals, @@ -477,7 +476,7 @@ __kernel void fgpilu_sweep( -void FGPILU::decomposition( +void ChowPatelIlu::decomposition( cl::CommandQueue *queue, cl::Context *context, int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs, int *L_rows, int *L_cols, double *L_vals, int L_nnzbs, @@ -489,22 +488,22 @@ void FGPILU::decomposition( try { // just put everything in the capture list std::call_once(initialize_flag, [&](){ - cl::Program::Sources source(1, std::make_pair(fgpilu_sweep_s, strlen(fgpilu_sweep_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector >' + cl::Program::Sources source(1, std::make_pair(chow_patel_ilu_sweep_s, strlen(chow_patel_ilu_sweep_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector >' cl::Program program = cl::Program(*context, source, &err); if (err != CL_SUCCESS) { - OPM_THROW(std::logic_error, "FGPILU OpenCL could not create Program"); + OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL could not create Program"); } std::vector devices = context->getInfo(); program.build(devices); - fgpilu_sweep_k.reset(new cl::make_kernel(cl::Kernel(program, "fgpilu_sweep", &err))); + const int, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "chow_patel_ilu_sweep", &err))); if (err != CL_SUCCESS) { - OPM_THROW(std::logic_error, "FGPILU OpenCL could not create Kernel"); + OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL could not create Kernel"); } // allocate GPU memory @@ -532,12 +531,12 @@ void FGPILU::decomposition( events.clear(); if (verbosity >= 3){ std::ostringstream out; - out << "FGPILU copy sparsity pattern time: " << t_copy_pattern.stop() << " s"; + out << "ChowPatelIlu copy sparsity pattern time: " << t_copy_pattern.stop() << " s"; OpmLog::info(out.str()); } if (verbosity >= 2){ std::ostringstream out; - out << "FGPILU PARALLEL: " << PARALLEL; + out << "ChowPatelIlu PARALLEL: " << PARALLEL; OpmLog::info(out.str()); } }); @@ -553,12 +552,12 @@ void FGPILU::decomposition( events.clear(); if (verbosity >= 3){ std::ostringstream out; - out << "FGPILU copy1 time: " << t_copy1.stop() << " s"; + out << "ChowPatelIlu copy1 time: " << t_copy1.stop() << " s"; OpmLog::info(out.str()); } if (err != CL_SUCCESS) { // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL - OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueWriteBuffer error"); + OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL enqueueWriteBuffer error"); } // call kernel @@ -580,7 +579,7 @@ void FGPILU::decomposition( int total_work_items = num_work_groups * work_group_size; int lmem_per_work_group = work_group_size * block_size * block_size * sizeof(double); Dune::Timer t_kernel; - event = (*fgpilu_sweep_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + event = (*chow_patel_ilu_sweep_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), *Uarg1, *Larg1, d_LU_vals, d_Ut_ptrs, d_L_rows, d_LU_rows, d_Ut_idxs, d_L_cols, d_LU_cols, @@ -588,7 +587,7 @@ void FGPILU::decomposition( event.wait(); if (verbosity >= 3){ std::ostringstream out; - out << "FGPILU sweep kernel time: " << t_kernel.stop() << " s"; + out << "ChowPatelIlu sweep kernel time: " << t_kernel.stop() << " s"; OpmLog::info(out.str()); } } @@ -607,12 +606,12 @@ void FGPILU::decomposition( events.clear(); if (verbosity >= 3){ std::ostringstream out; - out << "FGPILU copy2 time: " << t_copy2.stop() << " s"; + out << "ChowPatelIlu copy2 time: " << t_copy2.stop() << " s"; OpmLog::info(out.str()); } if (err != CL_SUCCESS) { // enqueueReadBuffer is C and does not throw exceptions like C++ OpenCL - OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueReadBuffer error"); + OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL enqueueReadBuffer error"); } } catch (const cl::Error& error) { diff --git a/opm/simulators/linalg/bda/fgpilu.hpp b/opm/simulators/linalg/bda/ChowPatelIlu.hpp similarity index 93% rename from opm/simulators/linalg/bda/fgpilu.hpp rename to opm/simulators/linalg/bda/ChowPatelIlu.hpp index b321e45b1..ff58de996 100644 --- a/opm/simulators/linalg/bda/fgpilu.hpp +++ b/opm/simulators/linalg/bda/ChowPatelIlu.hpp @@ -17,8 +17,8 @@ along with OPM. If not, see . */ -#ifndef FGPILU_HEADER_INCLUDED -#define FGPILU_HEADER_INCLUDED +#ifndef CHOW_PATEL_ILU_HEADER_INCLUDED +#define CHOW_PATEL_ILU_HEADER_INCLUDED #include @@ -33,7 +33,7 @@ namespace bda // FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896 // only blocksize == 3 is supported // decomposition() allocates the cl::Buffers on the first call, these are C++ objects that deallocate automatically - class FGPILU + class ChowPatelIlu { private: cl::Buffer d_Ut_vals, d_L_vals, d_LU_vals; @@ -51,10 +51,10 @@ namespace bda cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, - const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > fgpilu_sweep_k; + const int, cl::LocalSpaceArg, cl::LocalSpaceArg> > chow_patel_ilu_sweep_k; public: - /// Executes the FGPILU sweeps + /// Executes the ChowPatelIlu sweeps /// also copies data from CPU to GPU and GPU to CPU /// \param[in] queue OpenCL commandqueue /// \param[in] context OpenCL context @@ -84,4 +84,4 @@ namespace bda } // end namespace bda -#endif // FGPILU_HEADER_INCLUDED +#endif // CHOW_PATEL_ILU_HEADER_INCLUDED