From c7ac97e2153fa106db27ded94bf5a701d365c550 Mon Sep 17 00:00:00 2001 From: Jose Eduardo Bueno Date: Fri, 10 Dec 2021 08:17:19 -0300 Subject: [PATCH] [OpenCL] Moves all OpenCL kernels to *.cl files. Kernel files are located in opm/simulators/linalg/bda/opencl/kernels. CMake will combine them for usage in ${PROJECT_BINARY_DIR}/clSources.cpp that becomes part of the library. --- CMakeLists.txt | 4 + opencl-source-provider.cmake | 50 + .../linalg/bda/opencl/kernels/ILU_apply1.cl | 68 ++ .../bda/opencl/kernels/ILU_apply1_fm.cl | 68 ++ .../linalg/bda/opencl/kernels/ILU_apply2.cl | 75 ++ .../bda/opencl/kernels/ILU_apply2_fm.cl | 77 ++ .../linalg/bda/opencl/kernels/ILU_decomp.cl | 137 +++ .../kernels/add_coarse_pressure_correction.cl | 17 + .../linalg/bda/opencl/kernels/axpy.cl | 15 + .../linalg/bda/opencl/kernels/custom.cl | 22 + .../linalg/bda/opencl/kernels/dot_1.cl | 37 + .../kernels/full_to_pressure_restriction.cl | 22 + .../linalg/bda/opencl/kernels/norm.cl | 36 + .../bda/opencl/kernels/prolongate_vector.cl | 16 + .../linalg/bda/opencl/kernels/residual.cl | 50 + .../bda/opencl/kernels/residual_blocked.cl | 68 ++ .../linalg/bda/opencl/kernels/scale.cl | 14 + .../linalg/bda/opencl/kernels/spmv.cl | 49 + .../linalg/bda/opencl/kernels/spmv_blocked.cl | 67 ++ .../linalg/bda/opencl/kernels/spmv_noreset.cl | 48 + .../bda/opencl/kernels/stdwell_apply.cl | 76 ++ .../kernels/stdwell_apply_no_reorder.cl | 74 ++ .../linalg/bda/opencl/kernels/vmul.cl | 17 + opm/simulators/linalg/bda/openclKernels.cpp | 877 +----------------- opm/simulators/linalg/bda/openclKernels.hpp | 97 +- 25 files changed, 1155 insertions(+), 926 deletions(-) create mode 100644 opencl-source-provider.cmake create mode 100644 opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/ILU_decomp.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/add_coarse_pressure_correction.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/axpy.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/custom.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/dot_1.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/full_to_pressure_restriction.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/norm.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/prolongate_vector.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/residual.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/residual_blocked.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/scale.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/spmv.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/spmv_blocked.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/spmv_noreset.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/stdwell_apply.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/stdwell_apply_no_reorder.cl create mode 100644 opm/simulators/linalg/bda/opencl/kernels/vmul.cl diff --git a/CMakeLists.txt b/CMakeLists.txt index 391685534..c858ab51c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -287,6 +287,10 @@ macro (prereqs_hook) endmacro (prereqs_hook) macro (sources_hook) + if(OPENCL_FOUND) + include(opencl-source-provider) + list(APPEND opm-simulators_SOURCES ${PROJECT_BINARY_DIR}/clSources.cpp) + endif() endmacro (sources_hook) macro (fortran_hook) diff --git a/opencl-source-provider.cmake b/opencl-source-provider.cmake new file mode 100644 index 000000000..9b8ee7ca7 --- /dev/null +++ b/opencl-source-provider.cmake @@ -0,0 +1,50 @@ +set(BDA_DIR opm/simulators/linalg/bda) +set(KERNELS_DIR ${BDA_DIR}/opencl/kernels) + +if(DEBUG_KERNELS) + set(DEBUG_DIR ${KERNELS_DIR}/.debug) + + execute_process( + COMMAND ${CMAKE_COMMAND} -E make_directory ${DEBUG_DIR} + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + ) +endif() + +set(CL_SRC_FILE ${PROJECT_BINARY_DIR}/clSources.cpp) +file(WRITE ${CL_SRC_FILE} "// This file is auto-generated. Do not edit!\n\n") +file(APPEND ${CL_SRC_FILE} "#include \"${BDA_DIR}/openclKernels.hpp\"\n\n") +file(APPEND ${CL_SRC_FILE} "namespace Opm\{\n\n") +file(APPEND ${CL_SRC_FILE} "namespace Accelerator\{\n\n") + +file(GLOB CL_LIST "${KERNELS_DIR}/*.cl") + +if(USE_CHOW_PATEL_ILU) + list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply1_fm.cl") + list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply2_fm.cl") +else() + list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply1.cl") + list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply2.cl") +endif() + +foreach(CL ${CL_LIST}) + get_filename_component(FNAME ${CL} NAME_WE) + + file(APPEND ${CL_SRC_FILE} "const std::string OpenclKernels::${FNAME}_str = R\"\( \n") + file(READ "${CL}" CL_CONTENT) + file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}") + file(APPEND ${CL_SRC_FILE} "\)\"; \n\n") + + if(DEBUG_KERNELS) + execute_process( + COMMAND ocloc -file ${CL} -device kbl -out_dir ${DEBUG_DIR} + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + ) + endif() +endforeach() + +file(APPEND ${CL_SRC_FILE} "\}\n") +file(APPEND ${CL_SRC_FILE} "\}") + +if(DEBUG_KERNELS) + file(REMOVE_RECURSE ${DEBUG_DIR}) +endif() diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl new file mode 100644 index 000000000..fb067995d --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl @@ -0,0 +1,68 @@ +/// ILU apply part 1: forward substitution. +/// Solves L*x=y where L is a lower triangular sparse blocked matrix. +/// Here, L is it's own BSR matrix. +__kernel void ILU_apply1( + __global const double *LUvals, + __global const unsigned int *LUcols, + __global const unsigned int *LUrows, + __global const int *diagIndex, + __global const double *y, + __global double *x, + __global const unsigned int *nodesPerColorPrefix, + const unsigned int color, + const unsigned int block_size, + __local double *tmp) +{ + const unsigned int warpsize = 32; + const unsigned int bs = block_size; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int idx = get_global_id(0); + unsigned int target_block_row = idx / warpsize; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + target_block_row += nodesPerColorPrefix[color]; + + while(target_block_row < nodesPerColorPrefix[color+1]){ + const unsigned int first_block = LUrows[target_block_row]; + const unsigned int last_block = LUrows[target_block_row+1]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + + if(lane < num_active_threads){ + if(lane < bs){ + local_out = y[target_block_row*bs+lane]; + } + for(; block < last_block; block += num_blocks_per_warp){ + const double x_elem = x[LUcols[block]*bs + c]; + const double A_elem = LUvals[block*bs*bs + c + r*bs]; + local_out -= x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lane < bs){ + const unsigned int row = target_block_row*bs + lane; + x[row] = tmp[lane]; + } + + target_block_row += num_warps_in_grid; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl new file mode 100644 index 000000000..41d46e381 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl @@ -0,0 +1,68 @@ +/// ILU apply part 1: forward substitution. +/// Solves L*x=y where L is a lower triangular sparse blocked matrix. +/// Here, L is inside a normal, square matrix. +/// In this case, diagIndex indicates where the rows of L end. +__kernel void ILU_apply1( + __global const double *LUvals, + __global const unsigned int *LUcols, + __global const unsigned int *LUrows, + __global const int *diagIndex, + __global const double *y, + __global double *x, + __global const unsigned int *nodesPerColorPrefix, + const unsigned int color, + const unsigned int block_size, + __local double *tmp) +{ + const unsigned int warpsize = 32; + const unsigned int bs = block_size; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int idx = get_global_id(0); + unsigned int target_block_row = idx / warpsize; + target_block_row += nodesPerColorPrefix[color]; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + while(target_block_row < nodesPerColorPrefix[color+1]){ + const unsigned int first_block = LUrows[target_block_row]; + const unsigned int last_block = diagIndex[target_block_row]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + + if(lane < num_active_threads){ + if(lane < bs){ + local_out = y[target_block_row*bs+lane]; + } + for(; block < last_block; block += num_blocks_per_warp){ + const double x_elem = x[LUcols[block]*bs + c]; + const double A_elem = LUvals[block*bs*bs + c + r*bs]; + local_out -= x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lane < bs){ + const unsigned int row = target_block_row*bs + lane; + x[row] = tmp[lane]; + } + + target_block_row += num_warps_in_grid; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl new file mode 100644 index 000000000..315488f58 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl @@ -0,0 +1,75 @@ +/// ILU apply part 2: backward substitution. +/// Solves U*x=y where U is an upper triangular sparse blocked matrix. +/// Here, U is it's own BSR matrix. +__kernel void ILU_apply2( + __global const double *LUvals, + __global const int *LUcols, + __global const int *LUrows, + __global const int *diagIndex, + __global const double *invDiagVals, + __global double *x, + __global const unsigned int *nodesPerColorPrefix, + const unsigned int color, + const unsigned int block_size, + __local double *tmp) +{ + const unsigned int warpsize = 32; + const unsigned int bs = block_size; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int idx_g = get_global_id(0); + unsigned int target_block_row = idx_g / warpsize; + target_block_row += nodesPerColorPrefix[color]; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + while(target_block_row < nodesPerColorPrefix[color+1]){ + const unsigned int first_block = LUrows[target_block_row]; + const unsigned int last_block = LUrows[target_block_row+1]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + + if(lane < num_active_threads){ + if(lane < bs){ + const unsigned int row = target_block_row*bs+lane; + local_out = x[row]; + } + for(; block < last_block; block += num_blocks_per_warp){ + const double x_elem = x[LUcols[block]*bs + c]; + const double A_elem = LUvals[block*bs*bs + c + r*bs]; + local_out -= x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + local_out = tmp[lane]; + + if(lane < bs){ + tmp[lane + bs*idx_t/warpsize] = local_out; + double sum = 0.0; + for(int i = 0; i < bs; ++i){ + sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; + } + + const unsigned int row = target_block_row*bs + lane; + x[row] = sum; + } + + target_block_row += num_warps_in_grid; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl new file mode 100644 index 000000000..f888b826e --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl @@ -0,0 +1,77 @@ +/// ILU apply part 2: backward substitution. +/// Solves U*x=y where U is an upper triangular sparse blocked matrix. +/// Here, U is inside a normal, square matrix. +/// In this case diagIndex indicates where the rows of U start. +__kernel void ILU_apply2( + __global const double *LUvals, + __global const int *LUcols, + __global const int *LUrows, + __global const int *diagIndex, + __global const double *invDiagVals, + __global double *x, + __global const unsigned int *nodesPerColorPrefix, + const unsigned int color, + const unsigned int block_size, + __local double *tmp) +{ + const unsigned int warpsize = 32; + const unsigned int bs = block_size; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int idx_g = get_global_id(0); + unsigned int target_block_row = idx_g / warpsize; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + target_block_row += nodesPerColorPrefix[color]; + + while(target_block_row < nodesPerColorPrefix[color+1]){ + const unsigned int first_block = diagIndex[target_block_row] + 1; + const unsigned int last_block = LUrows[target_block_row+1]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + + if(lane < num_active_threads){ + if(lane < bs){ + const unsigned int row = target_block_row*bs+lane; + local_out = x[row]; + } + for(; block < last_block; block += num_blocks_per_warp){ + const double x_elem = x[LUcols[block]*bs + c]; + const double A_elem = LUvals[block*bs*bs + c + r*bs]; + local_out -= x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + local_out = tmp[lane]; + + if(lane < bs){ + tmp[lane + bs*idx_t/warpsize] = local_out; + double sum = 0.0; + for(int i = 0; i < bs; ++i){ + sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; + } + + const unsigned int row = target_block_row*bs + lane; + x[row] = sum; + } + + target_block_row += num_warps_in_grid; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_decomp.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_decomp.cl new file mode 100644 index 000000000..0b78e5c96 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_decomp.cl @@ -0,0 +1,137 @@ +// a = a - (b * c) +__kernel void block_mult_sub(__global double *a, __local double *b, __global double *c) +{ + const unsigned int block_size = 3; + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < block_size * block_size){ + const unsigned int row = thread_id_in_hwarp / block_size; + const unsigned int col = thread_id_in_hwarp % block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += b[block_size * row + k] * c[block_size * k + col]; + } + a[block_size * row + col] -= temp; + } +} + +// c = a * b +__kernel void block_mult(__global double *a, __global double *b, __local double *c) +{ + const unsigned int block_size = 3; + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < block_size * block_size){ + const unsigned int row = thread_id_in_hwarp / block_size; + const unsigned int col = thread_id_in_hwarp % block_size; + double temp = 0.0; + for (unsigned int k = 0; k < block_size; k++) { + temp += a[block_size * row + k] * b[block_size * k + col]; + } + c[block_size * row + col] = temp; + } +} + +// invert 3x3 matrix +__kernel void inverter(__global double *matrix, __global double *inverse) +{ + const unsigned int block_size = 3; + const unsigned int bs = block_size; // rename to shorter name + const unsigned int hwarp_size = 16; + const unsigned int idx_t = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) + if(thread_id_in_hwarp < bs * bs){ + double t4 = matrix[0] * matrix[4]; + double t6 = matrix[0] * matrix[5]; + double t8 = matrix[1] * matrix[3]; + double t10 = matrix[2] * matrix[3]; + double t12 = matrix[1] * matrix[6]; + double t14 = matrix[2] * matrix[6]; + + double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] + + t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]); + double t17 = 1.0 / det; + + const unsigned int r = thread_id_in_hwarp / bs; + const unsigned int c = thread_id_in_hwarp % bs; + const unsigned int r1 = (r+1) % bs; + const unsigned int c1 = (c+1) % bs; + const unsigned int r2 = (r+bs-1) % bs; + const unsigned int c2 = (c+bs-1) % bs; + inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17; + } +} + +/// Exact ilu decomposition kernel +/// The kernel takes a full BSR matrix and performs inplace ILU decomposition +__kernel void ilu_decomp(const unsigned int firstRow, + const unsigned int lastRow, + __global double *LUvals, + __global const int *LUcols, + __global const int *LUrows, + __global double *invDiagVals, + __global int *diagIndex, + const unsigned int Nb, + __local double *pivot) +{ + const unsigned int bs = 3; + const unsigned int hwarp_size = 16; + const unsigned int work_group_size = get_local_size(0); + const unsigned int work_group_id = get_group_id(0); + const unsigned int num_groups = get_num_groups(0); + const unsigned int hwarps_per_group = work_group_size / hwarp_size; + const unsigned int thread_id_in_group = get_local_id(0); // thread id in work group + const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size; // thread id in hwarp (16 threads) + const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size; + const unsigned int lmem_offset = hwarp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it + // every workitem in a hwarp has the same lmem_offset + + // go through all rows + for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group) + { + int iRowStart = LUrows[i]; + int iRowEnd = LUrows[i + 1]; + + // go through all elements of the row + for (int ij = iRowStart; ij < iRowEnd; ij++) { + int j = LUcols[ij]; + + if (j < i) { + // calculate the pivot of this row + block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset); + + // copy pivot + if (thread_id_in_hwarp < bs * bs) { + LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp]; + } + + int jRowEnd = LUrows[j + 1]; + int jk = diagIndex[j] + 1; + int ik = ij + 1; + // subtract that row scaled by the pivot from this row. + while (ik < iRowEnd && jk < jRowEnd) { + if (LUcols[ik] == LUcols[jk]) { + block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs); + ik++; + jk++; + } else { + if (LUcols[ik] < LUcols[jk]) + { ik++; } + else + { jk++; } + } + } + } + } + + // store the inverse in the diagonal + inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs); + + // copy inverse + if (thread_id_in_hwarp < bs * bs) { + LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp]; + } + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/add_coarse_pressure_correction.cl b/opm/simulators/linalg/bda/opencl/kernels/add_coarse_pressure_correction.cl new file mode 100644 index 000000000..c83f3c41f --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/add_coarse_pressure_correction.cl @@ -0,0 +1,17 @@ +/// add the coarse pressure solution back to the finer, complete solution +/// every workitem handles one blockrow +__kernel void add_coarse_pressure_correction( + __global const double *coarse_x, + __global double *fine_x, + const unsigned int pressure_idx, + const unsigned int Nb) +{ + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int block_size = 3; + unsigned int target_block_row = get_global_id(0); + + while(target_block_row < Nb){ + fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row]; + target_block_row += NUM_THREADS; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/axpy.cl b/opm/simulators/linalg/bda/opencl/kernels/axpy.cl new file mode 100644 index 000000000..e62c2890f --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/axpy.cl @@ -0,0 +1,15 @@ +/// axpy kernel: a = a + alpha * b +__kernel void axpy( + __global double *in, + const double a, + __global double *out, + const int N) +{ + unsigned int NUM_THREADS = get_global_size(0); + int idx = get_global_id(0); + + while(idx < N){ + out[idx] += a * in[idx]; + idx += NUM_THREADS; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/custom.cl b/opm/simulators/linalg/bda/opencl/kernels/custom.cl new file mode 100644 index 000000000..54236fba9 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/custom.cl @@ -0,0 +1,22 @@ +/// Custom kernel: combines some bicgstab vector operations into 1 +/// p = (p - omega * v) * beta + r +__kernel void custom( + __global double *p, + __global double *v, + __global double *r, + const double omega, + const double beta, + const int N) +{ + const unsigned int NUM_THREADS = get_global_size(0); + unsigned int idx = get_global_id(0); + + while(idx < N){ + double res = p[idx]; + res -= omega * v[idx]; + res *= beta; + res += r[idx]; + p[idx] = res; + idx += NUM_THREADS; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/dot_1.cl b/opm/simulators/linalg/bda/opencl/kernels/dot_1.cl new file mode 100644 index 000000000..1bbfec68a --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/dot_1.cl @@ -0,0 +1,37 @@ +/// returns partial sums, instead of the final dot product +/// partial sums are added on CPU +__kernel void dot_1( + __global double *in1, + __global double *in2, + __global double *out, + const unsigned int N, + __local double *tmp) +{ + unsigned int tid = get_local_id(0); + unsigned int bsize = get_local_size(0); + unsigned int bid = get_global_id(0) / bsize; + unsigned int i = get_global_id(0); + unsigned int NUM_THREADS = get_global_size(0); + + double sum = 0.0; + while(i < N){ + sum += in1[i] * in2[i]; + i += NUM_THREADS; + } + tmp[tid] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) + { + if (tid < s) + { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // write result for this block to global mem + if (tid == 0) out[get_group_id(0)] = tmp[0]; +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/full_to_pressure_restriction.cl b/opm/simulators/linalg/bda/opencl/kernels/full_to_pressure_restriction.cl new file mode 100644 index 000000000..147d8bc03 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/full_to_pressure_restriction.cl @@ -0,0 +1,22 @@ +/// transform blocked vector to scalar vector using pressure-weights +/// every workitem handles one blockrow +__kernel void full_to_pressure_restriction( + __global const double *fine_y, + __global const double *weights, + __global double *coarse_y, + const unsigned int Nb) +{ + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int block_size = 3; + unsigned int target_block_row = get_global_id(0); + + while(target_block_row < Nb){ + double sum = 0.0; + unsigned int idx = block_size * target_block_row; + for (unsigned int i = 0; i < block_size; ++i) { + sum += fine_y[idx + i] * weights[idx + i]; + } + coarse_y[target_block_row] = sum; + target_block_row += NUM_THREADS; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/norm.cl b/opm/simulators/linalg/bda/opencl/kernels/norm.cl new file mode 100644 index 000000000..2a32c6db5 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/norm.cl @@ -0,0 +1,36 @@ +/// returns partial sums, instead of the final norm +/// the square root must be computed on CPU +__kernel void norm( + __global double *in, + __global double *out, + const unsigned int N, + __local double *tmp) +{ + unsigned int tid = get_local_id(0); + unsigned int bsize = get_local_size(0); + unsigned int bid = get_global_id(0) / bsize; + unsigned int i = get_global_id(0); + unsigned int NUM_THREADS = get_global_size(0); + + double local_sum = 0.0; + while(i < N){ + local_sum += in[i] * in[i]; + i += NUM_THREADS; + } + tmp[tid] = local_sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + // do reduction in shared mem + for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) + { + if (tid < s) + { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // write result for this block to global mem + if (tid == 0) out[get_group_id(0)] = tmp[0]; +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/prolongate_vector.cl b/opm/simulators/linalg/bda/opencl/kernels/prolongate_vector.cl new file mode 100644 index 000000000..e23ab6599 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/prolongate_vector.cl @@ -0,0 +1,16 @@ +/// prolongate vector during amg cycle +/// every workitem handles one row +__kernel void prolongate_vector( + __global const double *in, + __global double *out, + __global const int *cols, + const unsigned int N) +{ + const unsigned int NUM_THREADS = get_global_size(0); + unsigned int row = get_global_id(0); + + while(row < N){ + out[row] += in[cols[row]]; + row += NUM_THREADS; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/residual.cl b/opm/simulators/linalg/bda/opencl/kernels/residual.cl new file mode 100644 index 000000000..4f64bf8bb --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/residual.cl @@ -0,0 +1,50 @@ +/// res = rhs - mat * x +/// algorithm based on: +/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, +/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 +__kernel void residual( + __global const double *vals, + __global const int *cols, + __global const int *rows, + const int N, + __global const double *x, + __global const double *rhs, + __global double *out, + __local double *tmp) +{ + const unsigned int bsize = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_workgroups = get_num_groups(0); + + int row = idx_b; + + while (row < N) { + int rowStart = rows[row]; + int rowEnd = rows[row+1]; + int rowLength = rowEnd - rowStart; + double local_sum = 0.0; + for (int j = rowStart + idx_t; j < rowEnd; j += bsize) { + int col = cols[j]; + local_sum += vals[j] * x[col]; + } + + tmp[idx_t] = local_sum; + barrier(CLK_LOCAL_MEM_FENCE); + + int offset = bsize / 2; + while(offset > 0) { + if (idx_t < offset) { + tmp[idx_t] += tmp[idx_t + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + offset = offset / 2; + } + + if (idx_t == 0) { + out[row] = rhs[row] - tmp[idx_t]; + } + + row += num_workgroups; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/residual_blocked.cl b/opm/simulators/linalg/bda/opencl/kernels/residual_blocked.cl new file mode 100644 index 000000000..a9f6c0c9c --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/residual_blocked.cl @@ -0,0 +1,68 @@ +/// res = rhs - mat * x +/// algorithm based on: +/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, +/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 +__kernel void residual_blocked( + __global const double *vals, + __global const int *cols, + __global const int *rows, + const int Nb, + __global const double *x, + __global const double *rhs, + __global double *out, + const unsigned int block_size, + __local double *tmp) +{ + const unsigned int warpsize = 32; + const unsigned int bsize = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int idx_t = get_local_id(0); + unsigned int idx = idx_b * bsize + idx_t; + const unsigned int bs = block_size; + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int target_block_row = idx / warpsize; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + // for 3x3 blocks: + // num_active_threads: 27 + // num_blocks_per_warp: 3 + + while(target_block_row < Nb){ + unsigned int first_block = rows[target_block_row]; + unsigned int last_block = rows[target_block_row+1]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + + if(lane < num_active_threads){ + for(; block < last_block; block += num_blocks_per_warp){ + double x_elem = x[cols[block]*bs + c]; + double A_elem = vals[block*bs*bs + c + r*bs]; + local_out += x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lane < bs){ + unsigned int row = target_block_row*bs + lane; + out[row] = rhs[row] - tmp[lane]; + } + target_block_row += num_warps_in_grid; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/scale.cl b/opm/simulators/linalg/bda/opencl/kernels/scale.cl new file mode 100644 index 000000000..3e28d90b6 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/scale.cl @@ -0,0 +1,14 @@ +/// scale vector with scalar: a = a * alpha +__kernel void scale( + __global double *vec, + const double a, + const int N) +{ + unsigned int NUM_THREADS = get_global_size(0); + int idx = get_global_id(0); + + while(idx < N){ + vec[idx] *= a; + idx += NUM_THREADS; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/spmv.cl b/opm/simulators/linalg/bda/opencl/kernels/spmv.cl new file mode 100644 index 000000000..30894a89d --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/spmv.cl @@ -0,0 +1,49 @@ +/// b = mat * x +/// algorithm based on: +/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, +/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 +__kernel void spmv( + __global const double *vals, + __global const int *cols, + __global const int *rows, + const int N, + __global const double *x, + __global double *out, + __local double *tmp) +{ + const unsigned int bsize = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_workgroups = get_num_groups(0); + + int row = idx_b; + + while (row < N) { + int rowStart = rows[row]; + int rowEnd = rows[row+1]; + int rowLength = rowEnd - rowStart; + double local_sum = 0.0; + for (int j = rowStart + idx_t; j < rowEnd; j += bsize) { + int col = cols[j]; + local_sum += vals[j] * x[col]; + } + + tmp[idx_t] = local_sum; + barrier(CLK_LOCAL_MEM_FENCE); + + int offset = bsize / 2; + while(offset > 0) { + if (idx_t < offset) { + tmp[idx_t] += tmp[idx_t + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + offset = offset / 2; + } + + if (idx_t == 0) { + out[row] = tmp[idx_t]; + } + + row += num_workgroups; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/spmv_blocked.cl b/opm/simulators/linalg/bda/opencl/kernels/spmv_blocked.cl new file mode 100644 index 000000000..6cfaaeff1 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/spmv_blocked.cl @@ -0,0 +1,67 @@ +/// b = mat * x +/// algorithm based on: +/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, +/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 +__kernel void spmv_blocked( + __global const double *vals, + __global const int *cols, + __global const int *rows, + const int Nb, + __global const double *x, + __global double *out, + const unsigned int block_size, + __local double *tmp) +{ + const unsigned int warpsize = 32; + const unsigned int bsize = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int idx_t = get_local_id(0); + unsigned int idx = idx_b * bsize + idx_t; + const unsigned int bs = block_size; + const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; + const unsigned int num_blocks_per_warp = warpsize/bs/bs; + const unsigned int NUM_THREADS = get_global_size(0); + const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; + unsigned int target_block_row = idx / warpsize; + const unsigned int lane = idx_t % warpsize; + const unsigned int c = (lane / bs) % bs; + const unsigned int r = lane % bs; + + // for 3x3 blocks: + // num_active_threads: 27 + // num_blocks_per_warp: 3 + + while(target_block_row < Nb){ + unsigned int first_block = rows[target_block_row]; + unsigned int last_block = rows[target_block_row+1]; + unsigned int block = first_block + lane / (bs*bs); + double local_out = 0.0; + + if(lane < num_active_threads){ + for(; block < last_block; block += num_blocks_per_warp){ + double x_elem = x[cols[block]*bs + c]; + double A_elem = vals[block*bs*bs + c + r*bs]; + local_out += x_elem * A_elem; + } + } + + // do reduction in shared mem + tmp[lane] = local_out; + barrier(CLK_LOCAL_MEM_FENCE); + + for(unsigned int offset = 3; offset <= 24; offset <<= 1) + { + if (lane + offset < warpsize) + { + tmp[lane] += tmp[lane + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lane < bs){ + unsigned int row = target_block_row*bs + lane; + out[row] = tmp[lane]; + } + target_block_row += num_warps_in_grid; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/spmv_noreset.cl b/opm/simulators/linalg/bda/opencl/kernels/spmv_noreset.cl new file mode 100644 index 000000000..b954407bc --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/spmv_noreset.cl @@ -0,0 +1,48 @@ +/// algorithm based on: +/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, +/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 +__kernel void spmv_noreset( + __global const double *vals, + __global const int *cols, + __global const int *rows, + const int N, + __global const double *x, + __global double *out, + __local double *tmp) +{ + const unsigned int bsize = get_local_size(0); + const unsigned int idx_b = get_global_id(0) / bsize; + const unsigned int idx_t = get_local_id(0); + const unsigned int num_workgroups = get_num_groups(0); + + int row = idx_b; + + while (row < N) { + int rowStart = rows[row]; + int rowEnd = rows[row+1]; + int rowLength = rowEnd - rowStart; + double local_sum = 0.0; + for (int j = rowStart + idx_t; j < rowEnd; j += bsize) { + int col = cols[j]; + local_sum += vals[j] * x[col]; + } + + tmp[idx_t] = local_sum; + barrier(CLK_LOCAL_MEM_FENCE); + + int offset = bsize / 2; + while(offset > 0) { + if (idx_t < offset) { + tmp[idx_t] += tmp[idx_t + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + offset = offset / 2; + } + + if (idx_t == 0) { + out[row] += tmp[idx_t]; + } + + row += num_workgroups; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply.cl b/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply.cl new file mode 100644 index 000000000..a4269e685 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply.cl @@ -0,0 +1,76 @@ +/// In this kernel there is reordering: the B/Ccols do not correspond with the x/y vector +/// the x/y vector is reordered, using toOrder to address that +__kernel void stdwell_apply( + __global const double *Cnnzs, + __global const double *Dnnzs, + __global const double *Bnnzs, + __global const int *Ccols, + __global const int *Bcols, + __global const double *x, + __global double *y, + __global const int *toOrder, + const unsigned int dim, + const unsigned int dim_wells, + __global const unsigned int *val_pointers, + __local double *localSum, + __local double *z1, + __local double *z2) +{ + int wgId = get_group_id(0); + int wiId = get_local_id(0); + int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; + int valsPerBlock = dim*dim_wells; + int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; + int numBlocksPerWarp = get_local_size(0)/valsPerBlock; + int c = wiId % dim; + int r = (wiId/dim) % dim_wells; + double temp; + + barrier(CLK_LOCAL_MEM_FENCE); + + localSum[wiId] = 0; + if(wiId < numActiveWorkItems){ + int b = wiId/valsPerBlock + val_pointers[wgId]; + while(b < valSize + val_pointers[wgId]){ + int colIdx = toOrder[Bcols[b]]; + localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; + b += numBlocksPerWarp; + } + + if(wiId < valsPerBlock){ + localSum[wiId] += localSum[wiId + valsPerBlock]; + } + + b = wiId/valsPerBlock + val_pointers[wgId]; + + if(c == 0 && wiId < valsPerBlock){ + for(unsigned int stride = 2; stride > 0; stride >>= 1){ + localSum[wiId] += localSum[wiId + stride]; + } + z1[r] = localSum[wiId]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim_wells){ + temp = 0.0; + for(unsigned int i = 0; i < dim_wells; ++i){ + temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; + } + z2[wiId] = temp; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim*valSize){ + temp = 0.0; + int bb = wiId/dim + val_pointers[wgId]; + for (unsigned int j = 0; j < dim_wells; ++j){ + temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; + } + + int colIdx = toOrder[Ccols[bb]]; + y[colIdx*dim + c] -= temp; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply_no_reorder.cl b/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply_no_reorder.cl new file mode 100644 index 000000000..885a1d0c0 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply_no_reorder.cl @@ -0,0 +1,74 @@ +/// Applies sdtwells without reordering +__kernel void stdwell_apply_no_reorder( + __global const double *Cnnzs, + __global const double *Dnnzs, + __global const double *Bnnzs, + __global const int *Ccols, + __global const int *Bcols, + __global const double *x, + __global double *y, + const unsigned int dim, + const unsigned int dim_wells, + __global const unsigned int *val_pointers, + __local double *localSum, + __local double *z1, + __local double *z2) +{ + int wgId = get_group_id(0); + int wiId = get_local_id(0); + int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; + int valsPerBlock = dim*dim_wells; + int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; + int numBlocksPerWarp = get_local_size(0)/valsPerBlock; + int c = wiId % dim; + int r = (wiId/dim) % dim_wells; + double temp; + + barrier(CLK_LOCAL_MEM_FENCE); + + localSum[wiId] = 0; + if(wiId < numActiveWorkItems){ + int b = wiId/valsPerBlock + val_pointers[wgId]; + while(b < valSize + val_pointers[wgId]){ + int colIdx = Bcols[b]; + localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; + b += numBlocksPerWarp; + } + + if(wiId < valsPerBlock){ + localSum[wiId] += localSum[wiId + valsPerBlock]; + } + + b = wiId/valsPerBlock + val_pointers[wgId]; + + if(c == 0 && wiId < valsPerBlock){ + for(unsigned int stride = 2; stride > 0; stride >>= 1){ + localSum[wiId] += localSum[wiId + stride]; + } + z1[r] = localSum[wiId]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim_wells){ + temp = 0.0; + for(unsigned int i = 0; i < dim_wells; ++i){ + temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; + } + z2[wiId] = temp; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim*valSize){ + temp = 0.0; + int bb = wiId/dim + val_pointers[wgId]; + for (unsigned int j = 0; j < dim_wells; ++j){ + temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; + } + + int colIdx = Ccols[bb]; + y[colIdx*dim + c] -= temp; + } +} diff --git a/opm/simulators/linalg/bda/opencl/kernels/vmul.cl b/opm/simulators/linalg/bda/opencl/kernels/vmul.cl new file mode 100644 index 000000000..93b9153c5 --- /dev/null +++ b/opm/simulators/linalg/bda/opencl/kernels/vmul.cl @@ -0,0 +1,17 @@ +/// multiply vector with another vector and a scalar, element-wise +/// add result to a third vector +__kernel void vmul( + const double alpha, + __global double const *in1, + __global double const *in2, + __global double *out, + const int N) +{ + unsigned int NUM_THREADS = get_global_size(0); + int idx = get_global_id(0); + + while(idx < N){ + out[idx] += alpha * in1[idx] * in2[idx]; + idx += NUM_THREADS; + } +} diff --git a/opm/simulators/linalg/bda/openclKernels.cpp b/opm/simulators/linalg/bda/openclKernels.cpp index 90553a0b1..e09b8ac3e 100644 --- a/opm/simulators/linalg/bda/openclKernels.cpp +++ b/opm/simulators/linalg/bda/openclKernels.cpp @@ -69,12 +69,8 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B) return A / B + (A % B > 0); } -void add_kernel_source(cl::Program::Sources &sources, const std::string &source) { - sources.emplace_back(source); -} - -void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector& devices, int verbosity_) { - +void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector& devices, int verbosity_) +{ if (initialized) { OpmLog::debug("Warning OpenclKernels is already initialized"); return; @@ -84,49 +80,30 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve verbosity = verbosity_; cl::Program::Sources sources; - const std::string& axpy_s = get_axpy_source(); - add_kernel_source(sources, axpy_s); - const std::string& scale_s = get_scale_source(); - add_kernel_source(sources, scale_s); - const std::string& vmul_s = get_vmul_source(); - add_kernel_source(sources, vmul_s); - const std::string& dot_1_s = get_dot_1_source(); - add_kernel_source(sources, dot_1_s); - const std::string& norm_s = get_norm_source(); - add_kernel_source(sources, norm_s); - const std::string& custom_s = get_custom_source(); - add_kernel_source(sources, custom_s); - const std::string& full_to_pressure_restriction_s = get_full_to_pressure_restriction_source(); - add_kernel_source(sources, full_to_pressure_restriction_s); - const std::string& add_coarse_pressure_correction_s = get_add_coarse_pressure_correction_source(); - add_kernel_source(sources, add_coarse_pressure_correction_s); - const std::string& prolongate_vector_s = get_prolongate_vector_source(); - add_kernel_source(sources, prolongate_vector_s); - const std::string& spmv_blocked_s = get_blocked_matrix_operation_source(matrix_operation::spmv_op); - add_kernel_source(sources, spmv_blocked_s); - const std::string& spmv_s = get_matrix_operation_source(matrix_operation::spmv_op, true); - add_kernel_source(sources, spmv_s); - const std::string& spmv_noreset_s = get_matrix_operation_source(matrix_operation::spmv_op, false); - add_kernel_source(sources, spmv_noreset_s); - const std::string& residual_blocked_s = get_blocked_matrix_operation_source(matrix_operation::residual_op); - add_kernel_source(sources, residual_blocked_s); - const std::string& residual_s = get_matrix_operation_source(matrix_operation::residual_op); - add_kernel_source(sources, residual_s); + sources.emplace_back(axpy_str); + sources.emplace_back(scale_str); + sources.emplace_back(vmul_str); + sources.emplace_back(dot_1_str); + sources.emplace_back(norm_str); + sources.emplace_back(custom_str); + sources.emplace_back(full_to_pressure_restriction_str); + sources.emplace_back(add_coarse_pressure_correction_str); + sources.emplace_back(prolongate_vector_str); + sources.emplace_back(spmv_blocked_str); + sources.emplace_back(spmv_str); + sources.emplace_back(spmv_noreset_str); + sources.emplace_back(residual_blocked_str); + sources.emplace_back(residual_str); #if CHOW_PATEL - bool ilu_operate_on_full_matrix = false; + sources.emplace_back(ILU_apply1_str); + sources.emplace_back(ILU_apply2_str); #else - bool ilu_operate_on_full_matrix = true; + sources.emplace_back(ILU_apply1_fm_str); + sources.emplace_back(ILU_apply2_fm_str); #endif - const std::string& ILU_apply1_s = get_ILU_apply1_source(ilu_operate_on_full_matrix); - add_kernel_source(sources, ILU_apply1_s); - const std::string& ILU_apply2_s = get_ILU_apply2_source(ilu_operate_on_full_matrix); - add_kernel_source(sources, ILU_apply2_s); - const std::string& stdwell_apply_s = get_stdwell_apply_source(true); - add_kernel_source(sources, stdwell_apply_s); - const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_source(false); - add_kernel_source(sources, stdwell_apply_no_reorder_s); - const std::string& ilu_decomp_s = get_ilu_decomp_source(); - add_kernel_source(sources, ilu_decomp_s); + sources.emplace_back(stdwell_apply_str); + sources.emplace_back(stdwell_apply_no_reorder_str); + sources.emplace_back(ILU_decomp_str); cl::Program program = cl::Program(*context, sources); program.build(devices); @@ -158,9 +135,6 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve initialized = true; } // end get_opencl_kernels() - - - double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N) { const unsigned int work_group_size = 256; @@ -387,7 +361,6 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row } } - void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size) { const unsigned int work_group_size = 32; @@ -406,7 +379,6 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& r } } - void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size) { const unsigned int work_group_size = 32; @@ -488,808 +460,5 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe } } - - std::string OpenclKernels::get_axpy_source() { - return R"( - __kernel void axpy( - __global double *in, - const double a, - __global double *out, - const int N) - { - unsigned int NUM_THREADS = get_global_size(0); - int idx = get_global_id(0); - - while(idx < N){ - out[idx] += a * in[idx]; - idx += NUM_THREADS; - } - } - )"; - } - - - // scale vector with scalar - std::string OpenclKernels::get_scale_source() { - return R"( - __kernel void scale( - __global double *vec, - const double a, - const int N) - { - unsigned int NUM_THREADS = get_global_size(0); - int idx = get_global_id(0); - - while(idx < N){ - vec[idx] *= a; - idx += NUM_THREADS; - } - } - )"; - } - - // multiply vector with another vector and a scalar, element-wise - // add result to a third vector - std::string OpenclKernels::get_vmul_source() { - return R"( - __kernel void vmul( - const double alpha, - __global double const *in1, - __global double const *in2, - __global double *out, - const int N) - { - unsigned int NUM_THREADS = get_global_size(0); - int idx = get_global_id(0); - - while(idx < N){ - out[idx] += alpha * in1[idx] * in2[idx]; - idx += NUM_THREADS; - } - } - )"; - } - - - // returns partial sums, instead of the final dot product - std::string OpenclKernels::get_dot_1_source() { - return R"( - __kernel void dot_1( - __global double *in1, - __global double *in2, - __global double *out, - const unsigned int N, - __local double *tmp) - { - unsigned int tid = get_local_id(0); - unsigned int bsize = get_local_size(0); - unsigned int bid = get_global_id(0) / bsize; - unsigned int i = get_global_id(0); - unsigned int NUM_THREADS = get_global_size(0); - - double sum = 0.0; - while(i < N){ - sum += in1[i] * in2[i]; - i += NUM_THREADS; - } - tmp[tid] = sum; - - barrier(CLK_LOCAL_MEM_FENCE); - - // do reduction in shared mem - for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) - { - if (tid < s) - { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // write result for this block to global mem - if (tid == 0) out[get_group_id(0)] = tmp[0]; - } - )"; - } - - - // returns partial sums, instead of the final norm - // the square root must be computed on CPU - std::string OpenclKernels::get_norm_source() { - return R"( - __kernel void norm( - __global double *in, - __global double *out, - const unsigned int N, - __local double *tmp) - { - unsigned int tid = get_local_id(0); - unsigned int bsize = get_local_size(0); - unsigned int bid = get_global_id(0) / bsize; - unsigned int i = get_global_id(0); - unsigned int NUM_THREADS = get_global_size(0); - - double local_sum = 0.0; - while(i < N){ - local_sum += in[i] * in[i]; - i += NUM_THREADS; - } - tmp[tid] = local_sum; - - barrier(CLK_LOCAL_MEM_FENCE); - - // do reduction in shared mem - for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) - { - if (tid < s) - { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // write result for this block to global mem - if (tid == 0) out[get_group_id(0)] = tmp[0]; - } - )"; - } - - - // p = (p - omega * v) * beta + r - std::string OpenclKernels::get_custom_source() { - return R"( - __kernel void custom( - __global double *p, - __global double *v, - __global double *r, - const double omega, - const double beta, - const int N) - { - const unsigned int NUM_THREADS = get_global_size(0); - unsigned int idx = get_global_id(0); - - while(idx < N){ - double res = p[idx]; - res -= omega * v[idx]; - res *= beta; - res += r[idx]; - p[idx] = res; - idx += NUM_THREADS; - } - } - )"; - } - - - // transform blocked vector to scalar vector using pressure-weights - // every workitem handles one blockrow - std::string OpenclKernels::get_full_to_pressure_restriction_source() { - return R"( - __kernel void full_to_pressure_restriction( - __global const double *fine_y, - __global const double *weights, - __global double *coarse_y, - const unsigned int Nb) - { - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int block_size = 3; - unsigned int target_block_row = get_global_id(0); - - while(target_block_row < Nb){ - double sum = 0.0; - unsigned int idx = block_size * target_block_row; - for (unsigned int i = 0; i < block_size; ++i) { - sum += fine_y[idx + i] * weights[idx + i]; - } - coarse_y[target_block_row] = sum; - target_block_row += NUM_THREADS; - } - } - )"; - } - - // add the coarse pressure solution back to the finer, complete solution - // every workitem handles one blockrow - std::string OpenclKernels::get_add_coarse_pressure_correction_source() { - return R"( - __kernel void add_coarse_pressure_correction( - __global const double *coarse_x, - __global double *fine_x, - const unsigned int pressure_idx, - const unsigned int Nb) - { - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int block_size = 3; - unsigned int target_block_row = get_global_id(0); - - while(target_block_row < Nb){ - fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row]; - target_block_row += NUM_THREADS; - } - } - )"; - } - - // prolongate vector during amg cycle - // every workitem handles one row - std::string OpenclKernels::get_prolongate_vector_source() { - return R"( - __kernel void prolongate_vector( - __global const double *in, - __global double *out, - __global const int *cols, - const unsigned int N) - { - const unsigned int NUM_THREADS = get_global_size(0); - unsigned int row = get_global_id(0); - - while(row < N){ - out[row] += in[cols[row]]; - row += NUM_THREADS; - } - } - )"; - } - - -/// either b = mat * x -/// or res = rhs - mat * x -std::string OpenclKernels::get_blocked_matrix_operation_source(matrix_operation op) { - std::string s; - if (op == matrix_operation::spmv_op) { - s += "__kernel void spmv_blocked("; - } else { - s += "__kernel void residual_blocked("; - } - s += R"(__global const double *vals, - __global const int *cols, - __global const int *rows, - const int Nb, - __global const double *x, - )"; - if (op == matrix_operation::residual_op) { - s += "__global const double *rhs,"; - } - s += R"( - __global double *out, - const unsigned int block_size, - __local double *tmp) - { - const unsigned int warpsize = 32; - const unsigned int bsize = get_local_size(0); - const unsigned int idx_b = get_global_id(0) / bsize; - const unsigned int idx_t = get_local_id(0); - unsigned int idx = idx_b * bsize + idx_t; - const unsigned int bs = block_size; - const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; - const unsigned int num_blocks_per_warp = warpsize/bs/bs; - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; - unsigned int target_block_row = idx / warpsize; - const unsigned int lane = idx_t % warpsize; - const unsigned int c = (lane / bs) % bs; - const unsigned int r = lane % bs; - - // for 3x3 blocks: - // num_active_threads: 27 - // num_blocks_per_warp: 3 - - while(target_block_row < Nb){ - unsigned int first_block = rows[target_block_row]; - unsigned int last_block = rows[target_block_row+1]; - unsigned int block = first_block + lane / (bs*bs); - double local_out = 0.0; - - if(lane < num_active_threads){ - for(; block < last_block; block += num_blocks_per_warp){ - double x_elem = x[cols[block]*bs + c]; - double A_elem = vals[block*bs*bs + c + r*bs]; - local_out += x_elem * A_elem; - } - } - - // do reduction in shared mem - tmp[lane] = local_out; - barrier(CLK_LOCAL_MEM_FENCE); - - for(unsigned int offset = 3; offset <= 24; offset <<= 1) - { - if (lane + offset < warpsize) - { - tmp[lane] += tmp[lane + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if(lane < bs){ - unsigned int row = target_block_row*bs + lane; - )"; - if (op == matrix_operation::spmv_op) { - s += " out[row] = tmp[lane];"; - } else { - s += " out[row] = rhs[row] - tmp[lane];"; - } - s += R"( - } - target_block_row += num_warps_in_grid; - } - } - )"; - return s; -} - - -/// either b = mat * x -/// or res = rhs - mat * x -std::string OpenclKernels::get_matrix_operation_source(matrix_operation op, bool spmv_reset) { - std::string s; - if (op == matrix_operation::spmv_op) { - if (spmv_reset) { - s += "__kernel void spmv("; - } else { - s += "__kernel void spmv_noreset("; - } - } else { - s += "__kernel void residual("; - } - s += R"(__global const double *vals, - __global const int *cols, - __global const int *rows, - const int N, - __global const double *x, - )"; - if (op == matrix_operation::residual_op) { - s += "__global const double *rhs,"; - } - s += R"( - __global double *out, - __local double *tmp) - { - const unsigned int bsize = get_local_size(0); - const unsigned int idx_b = get_global_id(0) / bsize; - const unsigned int idx_t = get_local_id(0); - const unsigned int num_workgroups = get_num_groups(0); - - int row = idx_b; - - while (row < N) { - int rowStart = rows[row]; - int rowEnd = rows[row+1]; - int rowLength = rowEnd - rowStart; - double local_sum = 0.0; - for (int j = rowStart + idx_t; j < rowEnd; j += bsize) { - int col = cols[j]; - local_sum += vals[j] * x[col]; - } - - tmp[idx_t] = local_sum; - barrier(CLK_LOCAL_MEM_FENCE); - - int offset = bsize / 2; - while(offset > 0) { - if (idx_t < offset) { - tmp[idx_t] += tmp[idx_t + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - offset = offset / 2; - } - - if (idx_t == 0) { - )"; - if (op == matrix_operation::spmv_op) { - if (spmv_reset) { - s += " out[row] = tmp[idx_t];"; - } else { - s += " out[row] += tmp[idx_t];"; - } - } else { - s += " out[row] = rhs[row] - tmp[idx_t];"; - } - s += R"( - } - row += num_workgroups; - } - } - )"; - return s; -} - - - std::string OpenclKernels::get_ILU_apply1_source(bool full_matrix) { - std::string s = R"( - __kernel void ILU_apply1( - __global const double *LUvals, - __global const unsigned int *LUcols, - __global const unsigned int *LUrows, - __global const int *diagIndex, - __global const double *y, - __global double *x, - __global const unsigned int *nodesPerColorPrefix, - const unsigned int color, - const unsigned int block_size, - __local double *tmp) - { - const unsigned int warpsize = 32; - const unsigned int bs = block_size; - const unsigned int idx_t = get_local_id(0); - const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; - const unsigned int num_blocks_per_warp = warpsize/bs/bs; - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; - unsigned int idx = get_global_id(0); - unsigned int target_block_row = idx / warpsize; - target_block_row += nodesPerColorPrefix[color]; - const unsigned int lane = idx_t % warpsize; - const unsigned int c = (lane / bs) % bs; - const unsigned int r = lane % bs; - - while(target_block_row < nodesPerColorPrefix[color+1]){ - const unsigned int first_block = LUrows[target_block_row]; - )"; - if (full_matrix) { - s += "const unsigned int last_block = diagIndex[target_block_row]; "; - } else { - s += "const unsigned int last_block = LUrows[target_block_row+1]; "; - } - s += R"( - unsigned int block = first_block + lane / (bs*bs); - double local_out = 0.0; - if(lane < num_active_threads){ - if(lane < bs){ - local_out = y[target_block_row*bs+lane]; - } - for(; block < last_block; block += num_blocks_per_warp){ - const double x_elem = x[LUcols[block]*bs + c]; - const double A_elem = LUvals[block*bs*bs + c + r*bs]; - local_out -= x_elem * A_elem; - } - } - - // do reduction in shared mem - tmp[lane] = local_out; - barrier(CLK_LOCAL_MEM_FENCE); - - for(unsigned int offset = 3; offset <= 24; offset <<= 1) - { - if (lane + offset < warpsize) - { - tmp[lane] += tmp[lane + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if(lane < bs){ - const unsigned int row = target_block_row*bs + lane; - x[row] = tmp[lane]; - } - - target_block_row += num_warps_in_grid; - } - } - )"; - return s; - } - - - std::string OpenclKernels::get_ILU_apply2_source(bool full_matrix) { - std::string s = R"( - __kernel void ILU_apply2( - __global const double *LUvals, - __global const int *LUcols, - __global const int *LUrows, - __global const int *diagIndex, - __global const double *invDiagVals, - __global double *x, - __global const unsigned int *nodesPerColorPrefix, - const unsigned int color, - const unsigned int block_size, - __local double *tmp) - { - const unsigned int warpsize = 32; - const unsigned int bs = block_size; - const unsigned int idx_t = get_local_id(0); - const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; - const unsigned int num_blocks_per_warp = warpsize/bs/bs; - const unsigned int NUM_THREADS = get_global_size(0); - const unsigned int num_warps_in_grid = NUM_THREADS / warpsize; - unsigned int idx_g = get_global_id(0); - unsigned int target_block_row = idx_g / warpsize; - target_block_row += nodesPerColorPrefix[color]; - const unsigned int lane = idx_t % warpsize; - const unsigned int c = (lane / bs) % bs; - const unsigned int r = lane % bs; - - while(target_block_row < nodesPerColorPrefix[color+1]){ - )"; - if (full_matrix) { - s += "const unsigned int first_block = diagIndex[target_block_row] + 1; "; - } else { - s += "const unsigned int first_block = LUrows[target_block_row]; "; - } - s += R"( - const unsigned int last_block = LUrows[target_block_row+1]; - unsigned int block = first_block + lane / (bs*bs); - double local_out = 0.0; - if(lane < num_active_threads){ - if(lane < bs){ - const unsigned int row = target_block_row*bs+lane; - local_out = x[row]; - } - for(; block < last_block; block += num_blocks_per_warp){ - const double x_elem = x[LUcols[block]*bs + c]; - const double A_elem = LUvals[block*bs*bs + c + r*bs]; - local_out -= x_elem * A_elem; - } - } - - // do reduction in shared mem - tmp[lane] = local_out; - barrier(CLK_LOCAL_MEM_FENCE); - - for(unsigned int offset = 3; offset <= 24; offset <<= 1) - { - if (lane + offset < warpsize) - { - tmp[lane] += tmp[lane + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - local_out = tmp[lane]; - - if(lane < bs){ - tmp[lane + bs*idx_t/warpsize] = local_out; - double sum = 0.0; - for(int i = 0; i < bs; ++i){ - sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; - } - - const unsigned int row = target_block_row*bs + lane; - x[row] = sum; - } - - target_block_row += num_warps_in_grid; - } - } - )"; - return s; - } - - std::string OpenclKernels::get_stdwell_apply_source(bool reorder) { - std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder"; - std::string s = "__kernel void " + kernel_name + R"(( - __global const double *Cnnzs, - __global const double *Dnnzs, - __global const double *Bnnzs, - __global const int *Ccols, - __global const int *Bcols, - __global const double *x, - __global double *y, - )"; - if (reorder) { - s += R"(__global const int *toOrder, - )"; - } - s += R"(const unsigned int dim, - const unsigned int dim_wells, - __global const unsigned int *val_pointers, - __local double *localSum, - __local double *z1, - __local double *z2){ - int wgId = get_group_id(0); - int wiId = get_local_id(0); - int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; - int valsPerBlock = dim*dim_wells; - int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; - int numBlocksPerWarp = get_local_size(0)/valsPerBlock; - int c = wiId % dim; - int r = (wiId/dim) % dim_wells; - double temp; - - barrier(CLK_LOCAL_MEM_FENCE); - - localSum[wiId] = 0; - if(wiId < numActiveWorkItems){ - int b = wiId/valsPerBlock + val_pointers[wgId]; - while(b < valSize + val_pointers[wgId]){ - )"; - if (reorder) { - s += "int colIdx = toOrder[Bcols[b]]; "; - } else { - s += "int colIdx = Bcols[b]; "; - } - s += R"( - localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; - b += numBlocksPerWarp; - } - - if(wiId < valsPerBlock){ - localSum[wiId] += localSum[wiId + valsPerBlock]; - } - - b = wiId/valsPerBlock + val_pointers[wgId]; - - if(c == 0 && wiId < valsPerBlock){ - for(unsigned int stride = 2; stride > 0; stride >>= 1){ - localSum[wiId] += localSum[wiId + stride]; - } - z1[r] = localSum[wiId]; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(wiId < dim_wells){ - temp = 0.0; - for(unsigned int i = 0; i < dim_wells; ++i){ - temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; - } - z2[wiId] = temp; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(wiId < dim*valSize){ - temp = 0.0; - int bb = wiId/dim + val_pointers[wgId]; - for (unsigned int j = 0; j < dim_wells; ++j){ - temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; - } - )"; - if (reorder) { - s += "int colIdx = toOrder[Ccols[bb]]; "; - } else { - s += "int colIdx = Ccols[bb]; "; - } - s += R"( - y[colIdx*dim + c] -= temp; - } - } - )"; - return s; - } - - - std::string OpenclKernels::get_ilu_decomp_source() { - return R"( - - // a = a - (b * c) - __kernel void block_mult_sub(__global double *a, __local double *b, __global double *c) - { - const unsigned int block_size = 3; - const unsigned int hwarp_size = 16; - const unsigned int idx_t = get_local_id(0); // thread id in work group - const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) - if(thread_id_in_hwarp < block_size * block_size){ - const unsigned int row = thread_id_in_hwarp / block_size; - const unsigned int col = thread_id_in_hwarp % block_size; - double temp = 0.0; - for (unsigned int k = 0; k < block_size; k++) { - temp += b[block_size * row + k] * c[block_size * k + col]; - } - a[block_size * row + col] -= temp; - } - } - - // c = a * b - __kernel void block_mult(__global double *a, __global double *b, __local double *c) { - const unsigned int block_size = 3; - const unsigned int hwarp_size = 16; - const unsigned int idx_t = get_local_id(0); // thread id in work group - const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) - if(thread_id_in_hwarp < block_size * block_size){ - const unsigned int row = thread_id_in_hwarp / block_size; - const unsigned int col = thread_id_in_hwarp % block_size; - double temp = 0.0; - for (unsigned int k = 0; k < block_size; k++) { - temp += a[block_size * row + k] * b[block_size * k + col]; - } - c[block_size * row + col] = temp; - } - } - - // invert 3x3 matrix - __kernel void inverter(__global double *matrix, __global double *inverse) { - const unsigned int block_size = 3; - const unsigned int bs = block_size; // rename to shorter name - const unsigned int hwarp_size = 16; - const unsigned int idx_t = get_local_id(0); // thread id in work group - const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads) - if(thread_id_in_hwarp < bs * bs){ - double t4 = matrix[0] * matrix[4]; - double t6 = matrix[0] * matrix[5]; - double t8 = matrix[1] * matrix[3]; - double t10 = matrix[2] * matrix[3]; - double t12 = matrix[1] * matrix[6]; - double t14 = matrix[2] * matrix[6]; - - double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] + - t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]); - double t17 = 1.0 / det; - - const unsigned int r = thread_id_in_hwarp / bs; - const unsigned int c = thread_id_in_hwarp % bs; - const unsigned int r1 = (r+1) % bs; - const unsigned int c1 = (c+1) % bs; - const unsigned int r2 = (r+bs-1) % bs; - const unsigned int c2 = (c+bs-1) % bs; - inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17; - } - } - - __kernel void ilu_decomp(const unsigned int firstRow, - const unsigned int lastRow, - __global double *LUvals, - __global const int *LUcols, - __global const int *LUrows, - __global double *invDiagVals, - __global int *diagIndex, - const unsigned int Nb, - __local double *pivot){ - - const unsigned int bs = 3; - const unsigned int hwarp_size = 16; - const unsigned int work_group_size = get_local_size(0); - const unsigned int work_group_id = get_group_id(0); - const unsigned int num_groups = get_num_groups(0); - const unsigned int hwarps_per_group = work_group_size / hwarp_size; - const unsigned int thread_id_in_group = get_local_id(0); // thread id in work group - const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size; // thread id in hwarp (16 threads) - const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size; - const unsigned int lmem_offset = hwarp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it - // every workitem in a hwarp has the same lmem_offset - - // go through all rows - for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group) - { - int iRowStart = LUrows[i]; - int iRowEnd = LUrows[i + 1]; - - // go through all elements of the row - for (int ij = iRowStart; ij < iRowEnd; ij++) { - int j = LUcols[ij]; - - if (j < i) { - // calculate the pivot of this row - block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset); - - // copy pivot - if (thread_id_in_hwarp < bs * bs) { - LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp]; - } - - int jRowEnd = LUrows[j + 1]; - int jk = diagIndex[j] + 1; - int ik = ij + 1; - // subtract that row scaled by the pivot from this row. - while (ik < iRowEnd && jk < jRowEnd) { - if (LUcols[ik] == LUcols[jk]) { - block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs); - ik++; - jk++; - } else { - if (LUcols[ik] < LUcols[jk]) - { ik++; } - else - { jk++; } - } - } - } - } - - // store the inverse in the diagonal - inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs); - - // copy inverse - if (thread_id_in_hwarp < bs * bs) { - LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp]; - } - - } - } - )"; - } - } // namespace Accelerator } // namespace Opm - diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index a90e36394..d03a40c02 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -61,11 +61,6 @@ private: static std::vector tmp; // used as tmp CPU buffer for dot() and norm() static bool initialized; - enum matrix_operation { - spmv_op, - residual_op - }; - static std::unique_ptr > dot_k; static std::unique_ptr > norm_k; static std::unique_ptr > axpy_k; @@ -86,76 +81,34 @@ private: static std::unique_ptr stdwell_apply_no_reorder_k; static std::unique_ptr ilu_decomp_k; - /// Generate string with axpy kernel - /// a = a + alpha * b - static std::string get_axpy_source(); - - /// Generate string with scale kernel - /// a = a * alpha - static std::string get_scale_source(); - - /// multiply vector with another vector and a scalar, element-wise - /// add result to a third vector - static std::string get_vmul_source(); - - /// returns partial sums, instead of the final dot product - /// partial sums are added on CPU - static std::string get_dot_1_source(); - - /// returns partial sums, instead of the final norm - /// the square root must be computed on CPU - static std::string get_norm_source(); - - /// Generate string with custom kernel - /// This kernel combines some ilubicgstab vector operations into 1 - /// p = (p - omega * v) * beta + r - static std::string get_custom_source(); - - /// Transform blocked vector to scalar vector using pressure-weights - static std::string get_full_to_pressure_restriction_source(); - - /// Add the coarse pressure solution back to the finer, complete solution - static std::string get_add_coarse_pressure_correction_source(); - - /// Prolongate a vector during the AMG cycle - static std::string get_prolongate_vector_source(); - - /// b = mat * x - /// algorithm based on: - /// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, - /// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 - /// or - /// res = rhs - (mat * x) - static std::string get_blocked_matrix_operation_source(matrix_operation op); - static std::string get_matrix_operation_source(matrix_operation op, bool spmv_reset = true); - - /// ILU apply part 1: forward substitution - /// solves L*x=y where L is a lower triangular sparse blocked matrix - /// this L can be it's own BSR matrix (if full_matrix is false), - /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end - /// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not - static std::string get_ILU_apply1_source(bool full_matrix); - - /// ILU apply part 2: backward substitution - /// solves U*x=y where U is an upper triangular sparse blocked matrix - /// this U can be it's own BSR matrix (if full_matrix is false), - /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start - /// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not - static std::string get_ILU_apply2_source(bool full_matrix); - - /// Generate string with the stdwell_apply kernels - /// If reorder is true, the B/Ccols do not correspond with the x/y vector - /// the x/y vector is reordered, use toOrder to address that - /// \param[in] reorder whether the matrix is reordered or not - static std::string get_stdwell_apply_source(bool reorder); - - /// Generate string with the exact ilu decomposition kernel - /// The kernel takes a full BSR matrix and performs inplace ILU decomposition - static std::string get_ilu_decomp_source(); - OpenclKernels(){}; // disable instantiation public: + static const std::string axpy_str; + static const std::string scale_str; + static const std::string vmul_str; + static const std::string dot_1_str; + static const std::string norm_str; + static const std::string custom_str; + static const std::string full_to_pressure_restriction_str; + static const std::string add_coarse_pressure_correction_str; + static const std::string prolongate_vector_str; + static const std::string spmv_blocked_str; + static const std::string spmv_str; + static const std::string spmv_noreset_str; + static const std::string residual_blocked_str; + static const std::string residual_str; +#if CHOW_PATEL + static const std::string ILU_apply1_str; + static const std::string ILU_apply2_str; +#else + static const std::string ILU_apply1_fm_str; + static const std::string ILU_apply2_fm_str; +#endif + static const std::string stdwell_apply_str; + static const std::string stdwell_apply_no_reorder_str; + static const std::string ILU_decomp_str; + static void init(cl::Context *context, cl::CommandQueue *queue, std::vector& devices, int verbosity); static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);