diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 5259d65b4..500c31102 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -60,6 +60,7 @@ if(OPENCL_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp
index d03fd7325..aeca4817e 100644
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@@ -26,6 +26,7 @@
 #include
 #include
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
 #include

 // if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
@@ -82,8 +83,8 @@ namespace bda
 #endif
         } GPU_storage;

-        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1;
-        cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2;
+        ilu_apply1_kernel_type *ILU_apply1;
+        ilu_apply2_kernel_type *ILU_apply2;
         cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> *ilu_decomp_k;
diff --git a/opm/simulators/linalg/bda/openclKernels.cpp b/opm/simulators/linalg/bda/openclKernels.cpp
new file mode 100644
index 000000000..a78be7451
--- /dev/null
+++ b/opm/simulators/linalg/bda/openclKernels.cpp
@@ -0,0 +1,634 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
+
+namespace bda
+{
+
+    std::string get_axpy_string() {
+        return R"(
+        __kernel void axpy(
+            __global double *in,
+            const double a,
+            __global double *out,
+            const int N)
+        {
+            unsigned int NUM_THREADS = get_global_size(0);
+            int idx = get_global_id(0);
+
+            while(idx < N){
+                out[idx] += a * in[idx];
+                idx += NUM_THREADS;
+            }
+        }
+        )";
+    }
+
+
+    // returns partial sums, instead of the final dot product
+    std::string get_dot_1_string() {
+        return R"(
+        __kernel void dot_1(
+            __global double *in1,
+            __global double *in2,
+            __global double *out,
+            const unsigned int N,
+            __local double *tmp)
+        {
+            unsigned int tid = get_local_id(0);
+            unsigned int bsize = get_local_size(0);
+            unsigned int bid = get_global_id(0) / bsize;
+            unsigned int i = get_global_id(0);
+            unsigned int NUM_THREADS = get_global_size(0);
+
+            double sum = 0.0;
+            while(i < N){
+                sum += in1[i] * in2[i];
+                i += NUM_THREADS;
+            }
+            tmp[tid] = sum;
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // do reduction in shared mem
+            for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
+            {
+                if (tid < s)
+                {
+                    tmp[tid] += tmp[tid + s];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+
+            // write result for this block to global mem
+            if (tid == 0) out[get_group_id(0)] = tmp[0];
+        }
+        )";
+    }
+
+
+    // returns partial sums, instead of the final norm
+    // the square root must be computed on CPU
+    std::string get_norm_string() {
+        return R"(
+        __kernel void norm(
+            __global double *in,
+            __global double *out,
+            const unsigned int N,
+            __local double *tmp)
+        {
+            unsigned int tid = get_local_id(0);
+            unsigned int bsize = get_local_size(0);
+            unsigned int bid = get_global_id(0) / bsize;
+            unsigned int i = get_global_id(0);
+            unsigned int NUM_THREADS = get_global_size(0);
+
+            double local_sum = 0.0;
+            while(i < N){
+                local_sum += in[i] * in[i];
+                i += NUM_THREADS;
+            }
+            tmp[tid] = local_sum;
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // do reduction in shared mem
+            for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
+            {
+                if (tid < s)
+                {
+                    tmp[tid] += tmp[tid + s];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+
+            // write result for this block to global mem
+            if (tid == 0) out[get_group_id(0)] = tmp[0];
+        }
+        )";
+    }
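+
+    // Illustrative host-side sketch (not part of this file; num_work_groups
+    // and out_host are assumed names): dot_1 and norm each write one partial
+    // result per work-group into 'out', so the caller finishes the reduction
+    // on the CPU, along the lines of:
+    //
+    //     double result = 0.0;
+    //     for (unsigned int i = 0; i < num_work_groups; ++i) {
+    //         result += out_host[i];       // 'out' copied back into out_host
+    //     }
+    //     double nrm = sqrt(result);       // norm only: square root on CPU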
+
+
+    // p = (p - omega * v) * beta + r
+    std::string get_custom_string() {
+        return R"(
+        __kernel void custom(
+            __global double *p,
+            __global double *v,
+            __global double *r,
+            const double omega,
+            const double beta,
+            const int N)
+        {
+            const unsigned int NUM_THREADS = get_global_size(0);
+            unsigned int idx = get_global_id(0);
+
+            while(idx < N){
+                double res = p[idx];
+                res -= omega * v[idx];
+                res *= beta;
+                res += r[idx];
+                p[idx] = res;
+                idx += NUM_THREADS;
+            }
+        }
+        )";
+    }
+
+
+    // b = mat * x
+    // algorithm based on:
+    // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
+    // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+    std::string get_spmv_blocked_string() {
+        return R"(
+        __kernel void spmv_blocked(
+            __global const double *vals,
+            __global const int *cols,
+            __global const int *rows,
+            const int Nb,
+            __global const double *x,
+            __global double *b,
+            const unsigned int block_size,
+            __local double *tmp)
+        {
+            const unsigned int warpsize = 32;
+            const unsigned int bsize = get_local_size(0);
+            const unsigned int idx_b = get_global_id(0) / bsize;
+            const unsigned int idx_t = get_local_id(0);
+            unsigned int idx = idx_b * bsize + idx_t;
+            const unsigned int bs = block_size;
+            const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+            const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+            const unsigned int NUM_THREADS = get_global_size(0);
+            const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+            unsigned int target_block_row = idx / warpsize;
+            const unsigned int lane = idx_t % warpsize;
+            const unsigned int c = (lane / bs) % bs;
+            const unsigned int r = lane % bs;
+
+            // for 3x3 blocks:
+            // num_active_threads: 27
+            // num_blocks_per_warp: 3
+
+            while(target_block_row < Nb){
+                unsigned int first_block = rows[target_block_row];
+                unsigned int last_block = rows[target_block_row+1];
+                unsigned int block = first_block + lane / (bs*bs);
+                double local_out = 0.0;
+
+                if(lane < num_active_threads){
+                    for(; block < last_block; block += num_blocks_per_warp){
+                        double x_elem = x[cols[block]*bs + c];
+                        double A_elem = vals[block*bs*bs + c + r*bs];
+                        local_out += x_elem * A_elem;
+                    }
+                }
+
+                // do reduction in shared mem
+                tmp[lane] = local_out;
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+                {
+                    if (lane + offset < warpsize)
+                    {
+                        tmp[lane] += tmp[lane + offset];
+                    }
+                    barrier(CLK_LOCAL_MEM_FENCE);
+                }
+
+                if(lane < bs){
+                    unsigned int row = target_block_row*bs + lane;
+                    b[row] = tmp[lane];
+                }
+                target_block_row += num_warps_in_grid;
+            }
+        }
+        )";
+    }
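+
+    // Annotation (illustrative for bs == 3, not in the generated source):
+    // each 32-lane warp owns one block row at a time; lane l works on block
+    // l/9 with r = l%3 and c = (l/3)%3, lanes 27..31 stay idle and write 0.0,
+    // and the offset sequence 3, 6, 12, 24 in the reduction above sums lanes
+    // r, r+3, ..., r+24, i.e. all partial products for output row entry r.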
+
+
+    // ILU apply part 1: forward substitution
+    // solves L*x=y where L is a lower triangular sparse blocked matrix
+    std::string get_ILU_apply1_string(bool full_matrix) {
+        std::string s = R"(
+        __kernel void ILU_apply1(
+            __global const double *LUvals,
+            __global const unsigned int *LUcols,
+            __global const unsigned int *LUrows,
+            __global const int *diagIndex,
+            __global const double *y,
+            __global double *x,
+            __global const unsigned int *nodesPerColorPrefix,
+            const unsigned int color,
+            const unsigned int block_size,
+            __local double *tmp)
+        {
+            const unsigned int warpsize = 32;
+            const unsigned int bs = block_size;
+            const unsigned int idx_t = get_local_id(0);
+            const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+            const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+            const unsigned int NUM_THREADS = get_global_size(0);
+            const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+            unsigned int idx = get_global_id(0);
+            unsigned int target_block_row = idx / warpsize;
+            target_block_row += nodesPerColorPrefix[color];
+            const unsigned int lane = idx_t % warpsize;
+            const unsigned int c = (lane / bs) % bs;
+            const unsigned int r = lane % bs;
+
+            while(target_block_row < nodesPerColorPrefix[color+1]){
+                const unsigned int first_block = LUrows[target_block_row];
+        )";
+        if (full_matrix) {
+            s += "const unsigned int last_block = diagIndex[target_block_row]; ";
+        } else {
+            s += "const unsigned int last_block = LUrows[target_block_row+1]; ";
+        }
+        s += R"(
+                unsigned int block = first_block + lane / (bs*bs);
+                double local_out = 0.0;
+                if(lane < num_active_threads){
+                    if(lane < bs){
+                        local_out = y[target_block_row*bs+lane];
+                    }
+                    for(; block < last_block; block += num_blocks_per_warp){
+                        const double x_elem = x[LUcols[block]*bs + c];
+                        const double A_elem = LUvals[block*bs*bs + c + r*bs];
+                        local_out -= x_elem * A_elem;
+                    }
+                }
+
+                // do reduction in shared mem
+                tmp[lane] = local_out;
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+                {
+                    if (lane + offset < warpsize)
+                    {
+                        tmp[lane] += tmp[lane + offset];
+                    }
+                    barrier(CLK_LOCAL_MEM_FENCE);
+                }
+
+                if(lane < bs){
+                    const unsigned int row = target_block_row*bs + lane;
+                    x[row] = tmp[lane];
+                }
+
+                target_block_row += num_warps_in_grid;
+            }
+        }
+        )";
+        return s;
+    }
+
+
+    // ILU apply part 2: backward substitution
+    // solves U*x=y where U is an upper triangular sparse blocked matrix
+    std::string get_ILU_apply2_string(bool full_matrix) {
+        std::string s = R"(
+        __kernel void ILU_apply2(
+            __global const double *LUvals,
+            __global const int *LUcols,
+            __global const int *LUrows,
+            __global const int *diagIndex,
+            __global const double *invDiagVals,
+            __global double *x,
+            __global const unsigned int *nodesPerColorPrefix,
+            const unsigned int color,
+            const unsigned int block_size,
+            __local double *tmp)
+        {
+            const unsigned int warpsize = 32;
+            const unsigned int bs = block_size;
+            const unsigned int idx_t = get_local_id(0);
+            const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+            const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+            const unsigned int NUM_THREADS = get_global_size(0);
+            const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+            unsigned int idx = get_global_id(0);
+            unsigned int target_block_row = idx / warpsize;
+            target_block_row += nodesPerColorPrefix[color];
+            const unsigned int lane = idx_t % warpsize;
+            const unsigned int c = (lane / bs) % bs;
+            const unsigned int r = lane % bs;
+            const double relaxation = 0.9;
+
+            while(target_block_row < nodesPerColorPrefix[color+1]){
+        )";
+        if (full_matrix) {
+            s += "const unsigned int first_block = diagIndex[target_block_row] + 1; ";
+        } else {
+            s += "const unsigned int first_block = LUrows[target_block_row]; ";
+        }
+        s += R"(
+                const unsigned int last_block = LUrows[target_block_row+1];
+                unsigned int block = first_block + lane / (bs*bs);
+                double local_out = 0.0;
+                if(lane < num_active_threads){
+                    if(lane < bs){
+                        const unsigned int row = target_block_row*bs+lane;
+                        local_out = x[row];
+                    }
+                    for(; block < last_block; block += num_blocks_per_warp){
+                        const double x_elem = x[LUcols[block]*bs + c];
+                        const double A_elem = LUvals[block*bs*bs + c + r*bs];
+                        local_out -= x_elem * A_elem;
+                    }
+                }
+
+                // do reduction in shared mem
+                tmp[lane] = local_out;
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+                {
+                    if (lane + offset < warpsize)
+                    {
+                        tmp[lane] += tmp[lane + offset];
+                    }
+                    barrier(CLK_LOCAL_MEM_FENCE);
+                }
+                local_out = tmp[lane];
+
+                if(lane < bs){
+                    tmp[lane + bs*idx_t/warpsize] = local_out;
+                    double sum = 0.0;
+                    for(int i = 0; i < bs; ++i){
+                        sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
+                    }
+
+                    const unsigned int row = target_block_row*bs + lane;
+                    x[row] = relaxation * sum;
+                }
+
+                target_block_row += num_warps_in_grid;
+            }
+        }
+        )";
+        return s;
+    }
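+
+    // Illustrative host-side sketch (assumed call pattern, not part of this
+    // file): both apply kernels are level-scheduled, so the host enqueues one
+    // launch per color; rows inside a color carry no mutual dependencies:
+    //
+    //     for (unsigned int color = 0; color < numColors; ++color) {
+    //         (*ILU_apply1)(cl::EnqueueArgs(queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+    //                       d_LUvals, d_LUcols, d_LUrows, d_diagIndex, d_y, d_x,
+    //                       d_nodesPerColorPrefix, color, block_size, cl::Local(lmem_per_work_group));
+    //     }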
"stdwell_apply" : "stdwell_apply_no_reorder"; + std::string s = "__kernel void " + kernel_name + R"(( + __global const double *Cnnzs, + __global const double *Dnnzs, + __global const double *Bnnzs, + __global const int *Ccols, + __global const int *Bcols, + __global const double *x, + __global double *y, + )"; + if (reorder) { + s += R"(__global const int *toOrder, + )"; + } + s += R"(const unsigned int dim, + const unsigned int dim_wells, + __global const unsigned int *val_pointers, + __local double *localSum, + __local double *z1, + __local double *z2){ + int wgId = get_group_id(0); + int wiId = get_local_id(0); + int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; + int valsPerBlock = dim*dim_wells; + int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; + int numBlocksPerWarp = get_local_size(0)/valsPerBlock; + int c = wiId % dim; + int r = (wiId/dim) % dim_wells; + double temp; + + barrier(CLK_LOCAL_MEM_FENCE); + + localSum[wiId] = 0; + if(wiId < numActiveWorkItems){ + int b = wiId/valsPerBlock + val_pointers[wgId]; + while(b < valSize + val_pointers[wgId]){ + )"; + if (reorder) { + s += "int colIdx = toOrder[Bcols[b]]; "; + } else { + s += "int colIdx = Bcols[b]; "; + } + s += R"( + localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; + b += numBlocksPerWarp; + } + + if(wiId < valsPerBlock){ + localSum[wiId] += localSum[wiId + valsPerBlock]; + } + + b = wiId/valsPerBlock + val_pointers[wgId]; + + if(c == 0 && wiId < valsPerBlock){ + for(unsigned int stride = 2; stride > 0; stride >>= 1){ + localSum[wiId] += localSum[wiId + stride]; + } + z1[r] = localSum[wiId]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim_wells){ + temp = 0.0; + for(unsigned int i = 0; i < dim_wells; ++i){ + temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; + } + z2[wiId] = temp; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim*valSize){ + temp = 0.0; + int bb = wiId/dim + val_pointers[wgId]; + for (unsigned int j = 0; j < dim_wells; ++j){ + temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; + } + )"; + if (reorder) { + s += "int colIdx = toOrder[Ccols[bb]]; "; + } else { + s += "int colIdx = Ccols[bb]; "; + } + s += R"( + y[colIdx*dim + c] -= temp; + } + } + )"; + return s; + } + + + std::string get_ilu_decomp_string() { + return R"( + + // a = a - (b * c) + __kernel void block_mult_sub(__global double *a, __local double *b, __global double *c) + { + const unsigned int block_size = 3; + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < block_size * block_size){ + const unsigned int row = thread_id_in_hwarp / block_size; + const unsigned int col = thread_id_in_hwarp % block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += b[block_size * row + k] * c[block_size * k + col]; + } + a[block_size * row + col] -= temp; + } + } + + // c = a * b + __kernel void block_mult(__global double *a, __global double *b, __local double *c) { + const unsigned int block_size = 3; + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < block_size * block_size){ + const unsigned int row = thread_id_in_hwarp / block_size; + const unsigned int col = thread_id_in_hwarp % 
+
+
+    std::string get_ilu_decomp_string() {
+        return R"(
+
+        // a = a - (b * c)
+        __kernel void block_mult_sub(__global double *a, __local double *b, __global double *c)
+        {
+            const unsigned int block_size = 3;
+            const unsigned int hwarp_size = 16;
+            const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+            const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
+            if(thread_id_in_hwarp < block_size * block_size){
+                const unsigned int row = thread_id_in_hwarp / block_size;
+                const unsigned int col = thread_id_in_hwarp % block_size;
+                double temp = 0.0;
+                for (unsigned int k = 0; k < block_size; k++) {
+                    temp += b[block_size * row + k] * c[block_size * k + col];
+                }
+                a[block_size * row + col] -= temp;
+            }
+        }
+
+        // c = a * b
+        __kernel void block_mult(__global double *a, __global double *b, __local double *c) {
+            const unsigned int block_size = 3;
+            const unsigned int hwarp_size = 16;
+            const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+            const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
+            if(thread_id_in_hwarp < block_size * block_size){
+                const unsigned int row = thread_id_in_hwarp / block_size;
+                const unsigned int col = thread_id_in_hwarp % block_size;
+                double temp = 0.0;
+                for (unsigned int k = 0; k < block_size; k++) {
+                    temp += a[block_size * row + k] * b[block_size * k + col];
+                }
+                c[block_size * row + col] = temp;
+            }
+        }
+
+        // invert 3x3 matrix
+        __kernel void inverter(__global double *matrix, __global double *inverse) {
+            const unsigned int block_size = 3;
+            const unsigned int bs = block_size;                           // rename to shorter name
+            const unsigned int hwarp_size = 16;
+            const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+            const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
+            if(thread_id_in_hwarp < bs * bs){
+                double t4  = matrix[0] * matrix[4];
+                double t6  = matrix[0] * matrix[5];
+                double t8  = matrix[1] * matrix[3];
+                double t10 = matrix[2] * matrix[3];
+                double t12 = matrix[1] * matrix[6];
+                double t14 = matrix[2] * matrix[6];
+
+                double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
+                              t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
+                double t17 = 1.0 / det;
+
+                const unsigned int r = thread_id_in_hwarp / bs;
+                const unsigned int c = thread_id_in_hwarp % bs;
+                const unsigned int r1 = (r+1) % bs;
+                const unsigned int c1 = (c+1) % bs;
+                const unsigned int r2 = (r+bs-1) % bs;
+                const unsigned int c2 = (c+bs-1) % bs;
+                inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
+            }
+        }
+
+        __kernel void ilu_decomp(const unsigned int firstRow,
+                                 const unsigned int lastRow,
+                                 __global double *LUvals,
+                                 __global const int *LUcols,
+                                 __global const int *LUrows,
+                                 __global double *invDiagVals,
+                                 __global int *diagIndex,
+                                 const unsigned int Nb,
+                                 __local double *pivot){
+
+            const unsigned int bs = 3;
+            const unsigned int hwarp_size = 16;
+            const unsigned int work_group_size = get_local_size(0);
+            const unsigned int work_group_id = get_group_id(0);
+            const unsigned int num_groups = get_num_groups(0);
+            const unsigned int hwarps_per_group = work_group_size / hwarp_size;
+            const unsigned int thread_id_in_group = get_local_id(0);      // thread id in work group
+            const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size;     // thread id in hwarp (16 threads)
+            const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size;
+            const unsigned int lmem_offset = hwarp_id_in_group * bs * bs;  // each workgroup gets some lmem, but the workitems have to share it
+                                                                           // every workitem in a hwarp has the same lmem_offset
+
+            // go through all rows
+            for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
+            {
+                int iRowStart = LUrows[i];
+                int iRowEnd = LUrows[i + 1];
+
+                // go through all elements of the row
+                for (int ij = iRowStart; ij < iRowEnd; ij++) {
+                    int j = LUcols[ij];
+
+                    if (j < i) {
+                        // calculate the pivot of this row
+                        block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
+
+                        // copy pivot
+                        if (thread_id_in_hwarp < bs * bs) {
+                            LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp];
+                        }
+
+                        int jRowEnd = LUrows[j + 1];
+                        int jk = diagIndex[j] + 1;
+                        int ik = ij + 1;
+                        // subtract that row scaled by the pivot from this row.
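+                        // Annotation: LUcols is sorted within each row, so the
+                        // loop below is a two-pointer merge of row i (from ij+1)
+                        // and row j (past its diagonal); only columns present in
+                        // both rows are updated, i.e. the ILU(0) sparsity pattern
+                        // is preserved and fill-in is dropped.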
+                        while (ik < iRowEnd && jk < jRowEnd) {
+                            if (LUcols[ik] == LUcols[jk]) {
+                                block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs);
+                                ik++;
+                                jk++;
+                            } else {
+                                if (LUcols[ik] < LUcols[jk])
+                                { ik++; }
+                                else
+                                { jk++; }
+                            }
+                        }
+                    }
+                }
+
+                // store the inverse in the diagonal
+                inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
+
+                // copy inverse
+                if (thread_id_in_hwarp < bs * bs) {
+                    LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
+                }
+
+            }
+        }
+        )";
+    }
+
+} // end namespace bda
+
diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp
index 20c7b36c8..826155f80 100644
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@@ -20,604 +20,63 @@
 #ifndef OPENCL_HPP
 #define OPENCL_HPP
+#include <string>
+
+#include <opm/simulators/linalg/bda/opencl.hpp>
+
 namespace bda
 {

-    inline const char* axpy_s = R"(
-    __kernel void axpy(
-        __global double *in,
-        const double a,
-        __global double *out,
-        const int N)
-    {
-        unsigned int NUM_THREADS = get_global_size(0);
-        int idx = get_global_id(0);
-
-        while(idx < N){
-            out[idx] += a * in[idx];
-            idx += NUM_THREADS;
-        }
-    }
-    )";
+using spmv_kernel_type = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
+using ilu_apply1_kernel_type = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
+using ilu_apply2_kernel_type = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
+using stdwell_apply_kernel_type = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
+using stdwell_apply_no_reorder_kernel_type = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
+using ilu_decomp_kernel_type = cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
+
+    std::string get_axpy_string();

     // returns partial sums, instead of the final dot product
-    inline const char* dot_1_s = R"(
-    __kernel void dot_1(
-        __global double *in1,
-        __global double *in2,
-        __global double *out,
-        const unsigned int N,
-        __local double *tmp)
-    {
-        unsigned int tid = get_local_id(0);
-        unsigned int bsize = get_local_size(0);
-        unsigned int bid = get_global_id(0) / bsize;
-        unsigned int i = get_global_id(0);
-        unsigned int NUM_THREADS = get_global_size(0);
-
-        double sum = 0.0;
-        while(i < N){
-            sum += in1[i] * in2[i];
-            i += NUM_THREADS;
-        }
-        tmp[tid] = sum;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // do reduction in shared mem
-        for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
-        {
-            if (tid < s)
-            {
-                tmp[tid] += tmp[tid + s];
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        // write result for this block to global mem
-        if (tid == 0) out[get_group_id(0)] = tmp[0];
-    }
-    )";
-
+    std::string get_dot_1_string();

     // returns partial sums, instead of the final norm
     // the square root must be computed on CPU
-    inline const char* norm_s = R"(
-    __kernel void norm(
-        __global double *in,
-        __global double *out,
-        const unsigned int N,
-        __local double *tmp)
-    {
-        unsigned int tid = get_local_id(0);
-        unsigned int bsize = get_local_size(0);
-        unsigned int bid = get_global_id(0) / bsize;
-        unsigned int i = get_global_id(0);
-        unsigned int NUM_THREADS = get_global_size(0);
-
-        double local_sum = 0.0;
-        while(i < N){
-            local_sum += in[i] * in[i];
-            i += NUM_THREADS;
-        }
-        tmp[tid] = local_sum;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // do reduction in shared mem
-        for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
-        {
-            if (tid < s)
-            {
-                tmp[tid] += tmp[tid + s];
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        // write result for this block to global mem
-        if (tid == 0) out[get_group_id(0)] = tmp[0];
-    }
-    )";
-
+    std::string get_norm_string();

     // p = (p - omega * v) * beta + r
-    inline const char* custom_s = R"(
-    __kernel void custom(
-        __global double *p,
-        __global double *v,
-        __global double *r,
-        const double omega,
-        const double beta,
-        const int N)
-    {
-        const unsigned int NUM_THREADS = get_global_size(0);
-        unsigned int idx = get_global_id(0);
-
-        while(idx < N){
-            double res = p[idx];
-            res -= omega * v[idx];
-            res *= beta;
-            res += r[idx];
-            p[idx] = res;
-            idx += NUM_THREADS;
-        }
-    }
-    )";
-
+    std::string get_custom_string();

     // b = mat * x
     // algorithm based on:
     // Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
     // Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
-    inline const char* spmv_blocked_s = R"(
-    __kernel void spmv_blocked(
-        __global const double *vals,
-        __global const int *cols,
-        __global const int *rows,
-        const int Nb,
-        __global const double *x,
-        __global double *b,
-        const unsigned int block_size,
-        __local double *tmp)
-    {
-        const unsigned int warpsize = 32;
-        const unsigned int bsize = get_local_size(0);
-        const unsigned int idx_b = get_global_id(0) / bsize;
-        const unsigned int idx_t = get_local_id(0);
-        unsigned int idx = idx_b * bsize + idx_t;
-        const unsigned int bs = block_size;
-        const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
-        const unsigned int num_blocks_per_warp = warpsize/bs/bs;
-        const unsigned int NUM_THREADS = get_global_size(0);
-        const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
-        unsigned int target_block_row = idx / warpsize;
-        const unsigned int lane = idx_t % warpsize;
-        const unsigned int c = (lane / bs) % bs;
-        const unsigned int r = lane % bs;
-
-        // for 3x3 blocks:
-        // num_active_threads: 27
-        // num_blocks_per_warp: 3
-
-        while(target_block_row < Nb){
-            unsigned int first_block = rows[target_block_row];
-            unsigned int last_block = rows[target_block_row+1];
-            unsigned int block = first_block + lane / (bs*bs);
-            double local_out = 0.0;
-
-            if(lane < num_active_threads){
-                for(; block < last_block; block += num_blocks_per_warp){
-                    double x_elem = x[cols[block]*bs + c];
-                    double A_elem = vals[block*bs*bs + c + r*bs];
-                    local_out += x_elem * A_elem;
-                }
-            }
-
-            // do reduction in shared mem
-            tmp[lane] = local_out;
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            for(unsigned int offset = 3; offset <= 24; offset <<= 1)
-            {
-                if (lane + offset < warpsize)
-                {
-                    tmp[lane] += tmp[lane + offset];
-                }
-                barrier(CLK_LOCAL_MEM_FENCE);
-            }
-
-            if(lane < bs){
-                unsigned int row = target_block_row*bs + lane;
-                b[row] = tmp[lane];
-            }
-            target_block_row += num_warps_in_grid;
-        }
-    }
-    )";
-
-
+    std::string get_spmv_blocked_string();

     // ILU apply part 1: forward substitution
     // solves L*x=y where L is a lower triangular sparse blocked matrix
-    std::string get_ILU_apply1_string(bool full_matrix) {
-        std::string s = R"(
-        __kernel void ILU_apply1(
-            __global const double *LUvals,
-            __global const unsigned int *LUcols,
-            __global const unsigned int *LUrows,
-            __global const int *diagIndex,
-            __global const double *y,
-            __global double *x,
-            __global const unsigned int *nodesPerColorPrefix,
-            const unsigned int color,
-            const unsigned int block_size,
-            __local double *tmp)
-        {
-            const unsigned int warpsize = 32;
-            const unsigned int bs = block_size;
-            const unsigned int idx_t = get_local_id(0);
-            const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
-            const unsigned int num_blocks_per_warp = warpsize/bs/bs;
-            const unsigned int NUM_THREADS = get_global_size(0);
-            const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
-            unsigned int idx = get_global_id(0);
-            unsigned int target_block_row = idx / warpsize;
-            target_block_row += nodesPerColorPrefix[color];
-            const unsigned int lane = idx_t % warpsize;
-            const unsigned int c = (lane / bs) % bs;
-            const unsigned int r = lane % bs;
-
-            while(target_block_row < nodesPerColorPrefix[color+1]){
-                const unsigned int first_block = LUrows[target_block_row];
-        )";
-        if (full_matrix) {
-            s += "const unsigned int last_block = diagIndex[target_block_row]; ";
-        } else {
-            s += "const unsigned int last_block = LUrows[target_block_row+1]; ";
-        }
-        s += R"(
-                unsigned int block = first_block + lane / (bs*bs);
-                double local_out = 0.0;
-                if(lane < num_active_threads){
-                    if(lane < bs){
-                        local_out = y[target_block_row*bs+lane];
-                    }
-                    for(; block < last_block; block += num_blocks_per_warp){
-                        const double x_elem = x[LUcols[block]*bs + c];
-                        const double A_elem = LUvals[block*bs*bs + c + r*bs];
-                        local_out -= x_elem * A_elem;
-                    }
-                }
-
-                // do reduction in shared mem
-                tmp[lane] = local_out;
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                for(unsigned int offset = 3; offset <= 24; offset <<= 1)
-                {
-                    if (lane + offset < warpsize)
-                    {
-                        tmp[lane] += tmp[lane + offset];
-                    }
-                    barrier(CLK_LOCAL_MEM_FENCE);
-                }
-
-                if(lane < bs){
-                    const unsigned int row = target_block_row*bs + lane;
-                    x[row] = tmp[lane];
-                }
-
-                target_block_row += num_warps_in_grid;
-            }
-        }
-        )";
-        return s;
-    }
+    std::string get_ILU_apply1_string(bool full_matrix);

     // ILU apply part 2: backward substitution
     // solves U*x=y where U is an upper triangular sparse blocked matrix
-    std::string get_ILU_apply2_string(bool full_matrix) {
-        std::string s = R"(
-        __kernel void ILU_apply2(
-            __global const double *LUvals,
-            __global const int *LUcols,
-            __global const int *LUrows,
-            __global const int *diagIndex,
-            __global const double *invDiagVals,
-            __global double *x,
-            __global const unsigned int *nodesPerColorPrefix,
-            const unsigned int color,
-            const unsigned int block_size,
-            __local double *tmp)
-        {
-            const unsigned int warpsize = 32;
-            const unsigned int bs = block_size;
-            const unsigned int idx_t = get_local_id(0);
-            const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
-            const unsigned int num_blocks_per_warp = warpsize/bs/bs;
-            const unsigned int NUM_THREADS = get_global_size(0);
-            const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
-            unsigned int idx = get_global_id(0);
-            unsigned int target_block_row = idx / warpsize;
-            target_block_row += nodesPerColorPrefix[color];
-            const unsigned int lane = idx_t % warpsize;
-            const unsigned int c = (lane / bs) % bs;
-            const unsigned int r = lane % bs;
-            const double relaxation = 0.9;
-
-            while(target_block_row < nodesPerColorPrefix[color+1]){
-        )";
-        if (full_matrix) {
-            s += "const unsigned int first_block = diagIndex[target_block_row] + 1; ";
-        } else {
-            s += "const unsigned int first_block = LUrows[target_block_row]; ";
-        }
-        s += R"(
-                const unsigned int last_block = LUrows[target_block_row+1];
-                unsigned int block = first_block + lane / (bs*bs);
-                double local_out = 0.0;
-                if(lane < num_active_threads){
-                    if(lane < bs){
-                        const unsigned int row = target_block_row*bs+lane;
-                        local_out = x[row];
-                    }
-                    for(; block < last_block; block += num_blocks_per_warp){
-                        const double x_elem = x[LUcols[block]*bs + c];
-                        const double A_elem = LUvals[block*bs*bs + c + r*bs];
-                        local_out -= x_elem * A_elem;
-                    }
-                }
-
-                // do reduction in shared mem
-                tmp[lane] = local_out;
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                for(unsigned int offset = 3; offset <= 24; offset <<= 1)
-                {
-                    if (lane + offset < warpsize)
-                    {
-                        tmp[lane] += tmp[lane + offset];
-                    }
-                    barrier(CLK_LOCAL_MEM_FENCE);
-                }
-                local_out = tmp[lane];
-
-                if(lane < bs){
-                    tmp[lane + bs*idx_t/warpsize] = local_out;
-                    double sum = 0.0;
-                    for(int i = 0; i < bs; ++i){
-                        sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
-                    }
-
-                    const unsigned int row = target_block_row*bs + lane;
-                    x[row] = relaxation * sum;
-                }
-
-                target_block_row += num_warps_in_grid;
-            }
-        }
-        )";
-        return s;
-    }
+    std::string get_ILU_apply2_string(bool full_matrix);

     /// Generate string with the stdwell_apply kernels
     /// If reorder is true, the B/Ccols do not correspond with the x/y vector,
     /// because the x/y vector is reordered; use toOrder to map the indices
     /// \param[in] reorder    whether the matrix is reordered or not
-    std::string get_stdwell_apply_string(bool reorder) {
-        std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
-        std::string s = "__kernel void " + kernel_name + R"((
-            __global const double *Cnnzs,
-            __global const double *Dnnzs,
-            __global const double *Bnnzs,
-            __global const int *Ccols,
-            __global const int *Bcols,
-            __global const double *x,
-            __global double *y,
-            )";
-        if (reorder) {
-            s += R"(__global const int *toOrder,
-            )";
-        }
-        s += R"(const unsigned int dim,
-            const unsigned int dim_wells,
-            __global const unsigned int *val_pointers,
-            __local double *localSum,
-            __local double *z1,
-            __local double *z2){
-            int wgId = get_group_id(0);
-            int wiId = get_local_id(0);
-            int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
-            int valsPerBlock = dim*dim_wells;
-            int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
-            int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
-            int c = wiId % dim;
-            int r = (wiId/dim) % dim_wells;
-            double temp;
+    std::string get_stdwell_apply_string(bool reorder);

-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            localSum[wiId] = 0;
-            if(wiId < numActiveWorkItems){
-                int b = wiId/valsPerBlock + val_pointers[wgId];
-                while(b < valSize + val_pointers[wgId]){
-        )";
-        if (reorder) {
-            s += "int colIdx = toOrder[Bcols[b]]; ";
-        } else {
-            s += "int colIdx = Bcols[b]; ";
-        }
-        s += R"(
-                    localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
-                    b += numBlocksPerWarp;
-                }
-
-                if(wiId < valsPerBlock){
-                    localSum[wiId] += localSum[wiId + valsPerBlock];
-                }
-
-                b = wiId/valsPerBlock + val_pointers[wgId];
-
-                if(c == 0 && wiId < valsPerBlock){
-                    for(unsigned int stride = 2; stride > 0; stride >>= 1){
-                        localSum[wiId] += localSum[wiId + stride];
-                    }
-                    z1[r] = localSum[wiId];
-                }
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            if(wiId < dim_wells){
-                temp = 0.0;
-                for(unsigned int i = 0; i < dim_wells; ++i){
-                    temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
-                }
-                z2[wiId] = temp;
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            if(wiId < dim*valSize){
-                temp = 0.0;
-                int bb = wiId/dim + val_pointers[wgId];
-                for (unsigned int j = 0; j < dim_wells; ++j){
-                    temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
-                }
-        )";
-        if (reorder) {
-            s += "int colIdx = toOrder[Ccols[bb]]; ";
-        } else {
-            s += "int colIdx = Ccols[bb]; ";
-        }
-        s += R"(
-                y[colIdx*dim + c] -= temp;
-            }
-        }
-        )";
-        return s;
-    }
-
-    inline const char* ilu_decomp_s = R"(
-
-    // a = a - (b * c)
-    __kernel void block_mult_sub(__global double *a, __local double *b, __global double *c)
-    {
-        const unsigned int block_size = 3;
-        const unsigned int hwarp_size = 16;
-        const unsigned int idx_t = get_local_id(0);                   // thread id in work group
-        const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
-        if(thread_id_in_hwarp < block_size * block_size){
-            const unsigned int row = thread_id_in_hwarp / block_size;
-            const unsigned int col = thread_id_in_hwarp % block_size;
-            double temp = 0.0;
-            for (unsigned int k = 0; k < block_size; k++) {
-                temp += b[block_size * row + k] * c[block_size * k + col];
-            }
-            a[block_size * row + col] -= temp;
-        }
-    }
-
-    // c = a * b
-    __kernel void block_mult(__global double *a, __global double *b, __local double *c) {
-        const unsigned int block_size = 3;
-        const unsigned int hwarp_size = 16;
-        const unsigned int idx_t = get_local_id(0);                   // thread id in work group
-        const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
-        if(thread_id_in_hwarp < block_size * block_size){
-            const unsigned int row = thread_id_in_hwarp / block_size;
-            const unsigned int col = thread_id_in_hwarp % block_size;
-            double temp = 0.0;
-            for (unsigned int k = 0; k < block_size; k++) {
-                temp += a[block_size * row + k] * b[block_size * k + col];
-            }
-            c[block_size * row + col] = temp;
-        }
-    }
-
-    // invert 3x3 matrix
-    __kernel void inverter(__global double *matrix, __global double *inverse) {
-        const unsigned int block_size = 3;
-        const unsigned int bs = block_size;                           // rename to shorter name
-        const unsigned int hwarp_size = 16;
-        const unsigned int idx_t = get_local_id(0);                   // thread id in work group
-        const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
-        if(thread_id_in_hwarp < bs * bs){
-            double t4  = matrix[0] * matrix[4];
-            double t6  = matrix[0] * matrix[5];
-            double t8  = matrix[1] * matrix[3];
-            double t10 = matrix[2] * matrix[3];
-            double t12 = matrix[1] * matrix[6];
-            double t14 = matrix[2] * matrix[6];
-
-            double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
-                          t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
-            double t17 = 1.0 / det;
-
-            const unsigned int r = thread_id_in_hwarp / bs;
-            const unsigned int c = thread_id_in_hwarp % bs;
-            const unsigned int r1 = (r+1) % bs;
-            const unsigned int c1 = (c+1) % bs;
-            const unsigned int r2 = (r+bs-1) % bs;
-            const unsigned int c2 = (c+bs-1) % bs;
-            inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
-        }
-    }
-
-    __kernel void ilu_decomp(const unsigned int firstRow,
-                             const unsigned int lastRow,
-                             __global double *LUvals,
-                             __global const int *LUcols,
-                             __global const int *LUrows,
-                             __global double *invDiagVals,
-                             __global int *diagIndex,
-                             const unsigned int Nb,
-                             __local double *pivot){
-
-        const unsigned int bs = 3;
-        const unsigned int hwarp_size = 16;
-        const unsigned int work_group_size = get_local_size(0);
-        const unsigned int work_group_id = get_group_id(0);
-        const unsigned int num_groups = get_num_groups(0);
-        const unsigned int hwarps_per_group = work_group_size / hwarp_size;
-        const unsigned int thread_id_in_group = get_local_id(0);      // thread id in work group
-        const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size;     // thread id in hwarp (16 threads)
-        const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size;
-        const unsigned int lmem_offset = hwarp_id_in_group * bs * bs;  // each workgroup gets some lmem, but the workitems have to share it
-                                                                       // every workitem in a hwarp has the same lmem_offset
-
-        // go through all rows
-        for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
-        {
-            int iRowStart = LUrows[i];
-            int iRowEnd = LUrows[i + 1];
-
-            // go through all elements of the row
-            for (int ij = iRowStart; ij < iRowEnd; ij++) {
-                int j = LUcols[ij];
-
-                if (j < i) {
-                    // calculate the pivot of this row
-                    block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
-
-                    // copy pivot
-                    if (thread_id_in_hwarp < bs * bs) {
-                        LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp];
-                    }
-
-                    int jRowEnd = LUrows[j + 1];
-                    int jk = diagIndex[j] + 1;
-                    int ik = ij + 1;
-                    // substract that row scaled by the pivot from this row.
-                    while (ik < iRowEnd && jk < jRowEnd) {
-                        if (LUcols[ik] == LUcols[jk]) {
-                            block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs);
-                            ik++;
-                            jk++;
-                        } else {
-                            if (LUcols[ik] < LUcols[jk])
-                            { ik++; }
-                            else
-                            { jk++; }
-                        }
-                    }
-                }
-            }
-
-            // store the inverse in the diagonal
-            inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
-
-            // copy inverse
-            if (thread_id_in_hwarp < bs * bs) {
-                LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
-            }
-
-        }
-    }
-    )";
+    std::string get_ilu_decomp_string();

 } // end namespace bda
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp
index 64d6c23c2..3a93fe008 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@@ -26,7 +26,6 @@
 #include
 #include
-#include
 #include
 #include
@@ -524,29 +523,41 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     initialized = true;
 } // end initialize()

+void add_kernel_string(cl::Program::Sources &sources, std::string &source) {
+    sources.emplace_back(std::make_pair(source.c_str(), source.size()));
+}
+
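+// Lifetime note (an invariant this helper relies on, not enforced anywhere):
+// cl::Program::Sources only stores (const char*, size) pairs, so every
+// std::string passed in must stay alive until the cl::Program has been built;
+// get_opencl_kernels() therefore keeps each kernel string in a named local.
+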
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::get_opencl_kernels() {
-    cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s)));    // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, ::size_t> >'
-    source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
-    source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
-    source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
-    source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
+    cl::Program::Sources sources;
+    std::string axpy_s = get_axpy_string();
+    add_kernel_string(sources, axpy_s);
+    std::string dot_1_s = get_dot_1_string();
+    add_kernel_string(sources, dot_1_s);
+    std::string norm_s = get_norm_string();
+    add_kernel_string(sources, norm_s);
+    std::string custom_s = get_custom_string();
+    add_kernel_string(sources, custom_s);
+    std::string spmv_blocked_s = get_spmv_blocked_string();
+    add_kernel_string(sources, spmv_blocked_s);
 #if CHOW_PATEL
-    bool full_matrix = false;
+    bool ilu_operate_on_full_matrix = false;
 #else
-    bool full_matrix = true;
+    bool ilu_operate_on_full_matrix = true;
 #endif
-    std::string ILU_apply1_s = get_ILU_apply1_string(full_matrix);
-    source.emplace_back(std::make_pair(ILU_apply1_s.c_str(), strlen(ILU_apply1_s.c_str())));
-    std::string ILU_apply2_s = get_ILU_apply2_string(full_matrix);
-    source.emplace_back(std::make_pair(ILU_apply2_s.c_str(), strlen(ILU_apply2_s.c_str())));
+    std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
+    add_kernel_string(sources, ILU_apply1_s);
+    std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
+    add_kernel_string(sources, ILU_apply2_s);
     std::string stdwell_apply_s = get_stdwell_apply_string(true);
+    add_kernel_string(sources, stdwell_apply_s);
     std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
-    source.emplace_back(std::make_pair(stdwell_apply_s.c_str(), strlen(stdwell_apply_s.c_str())));
-    source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s.c_str(), strlen(stdwell_apply_no_reorder_s.c_str())));
-    source.emplace_back(std::make_pair(ilu_decomp_s, strlen(ilu_decomp_s)));
-    cl::Program program = cl::Program(*context, source);
+    add_kernel_string(sources, stdwell_apply_no_reorder_s);
+    std::string ilu_decomp_s = get_ilu_decomp_string();
+    add_kernel_string(sources, ilu_decomp_s);
+
+    cl::Program program = cl::Program(*context, sources);
     program.build(devices);

     // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
@@ -557,19 +568,12 @@ void openclSolverBackend<block_size>::get_opencl_kernels() {
     norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
     axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const int>(cl::Kernel(program, "axpy")));
     custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const int>(cl::Kernel(program, "custom")));
-    spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
-    ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
-    ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
-    stdwell_apply_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
-    stdwell_apply_no_reorder_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
-    ilu_decomp_k.reset(new cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ilu_decomp")));
+    spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
+    ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
+    ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
+    stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
+    stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
+    ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
 } // end get_opencl_kernels()

 template <unsigned int block_size>
diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp
index e51304990..0f8dee8f1 100644
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@@ -21,6 +21,7 @@
 #define OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED

 #include
+#include <opm/simulators/linalg/bda/openclKernels.hpp>
 #include
 #include
 #include
@@ -68,19 +69,12 @@ private:
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const int> > axpy_k;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const int> > custom_k;
-    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
-    std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_no_reorder_k;
-    std::shared_ptr<cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > ilu_decomp_k;
+    std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
+    std::shared_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
+    std::shared_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
+    std::shared_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
+    std::shared_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
+    std::shared_ptr<ilu_decomp_kernel_type> ilu_decomp_k;

     Preconditioner *prec = nullptr;                   // only supported preconditioner is BILU0
     int *toOrder = nullptr, *fromOrder = nullptr;     // BILU0 reorders rows of the matrix via these mappings