mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Merge pull request #3754 from blattms/clean-opencl-kernels-rebased
Clean opencl kernels rebased (rebased #3749)
This commit is contained in:
commit
994260aaea
@ -287,6 +287,10 @@ macro (prereqs_hook)
|
|||||||
endmacro (prereqs_hook)
|
endmacro (prereqs_hook)
|
||||||
|
|
||||||
macro (sources_hook)
|
macro (sources_hook)
|
||||||
|
if(OPENCL_FOUND)
|
||||||
|
include(opencl-source-provider)
|
||||||
|
list(APPEND opm-simulators_SOURCES ${PROJECT_BINARY_DIR}/clSources.cpp)
|
||||||
|
endif()
|
||||||
endmacro (sources_hook)
|
endmacro (sources_hook)
|
||||||
|
|
||||||
macro (fortran_hook)
|
macro (fortran_hook)
|
||||||
|
51
opencl-source-provider.cmake
Normal file
51
opencl-source-provider.cmake
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
set(BDA_DIR opm/simulators/linalg/bda)
|
||||||
|
set(KERNELS_DIR ${BDA_DIR}/opencl/kernels)
|
||||||
|
|
||||||
|
option(DEBUG_OPENCL_KERNELS_INTEL "Run ocloc to check kernel (works only on Intel)" OFF)
|
||||||
|
if(DEBUG_OPENCL_KERNELS_INTEL)
|
||||||
|
set(DEBUG_OPENCL_DIR ${KERNELS_DIR}/.debug)
|
||||||
|
|
||||||
|
execute_process(
|
||||||
|
COMMAND ${CMAKE_COMMAND} -E make_directory ${DEBUG_OPENCL_DIR}
|
||||||
|
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set(CL_SRC_FILE ${PROJECT_BINARY_DIR}/clSources.cpp)
|
||||||
|
file(WRITE ${CL_SRC_FILE} "// This file is auto-generated. Do not edit!\n\n")
|
||||||
|
file(APPEND ${CL_SRC_FILE} "#include \"${BDA_DIR}/openclKernels.hpp\"\n\n")
|
||||||
|
file(APPEND ${CL_SRC_FILE} "namespace Opm\{\n\n")
|
||||||
|
file(APPEND ${CL_SRC_FILE} "namespace Accelerator\{\n\n")
|
||||||
|
|
||||||
|
file(GLOB CL_LIST "${KERNELS_DIR}/*.cl")
|
||||||
|
|
||||||
|
if(USE_CHOW_PATEL_ILU)
|
||||||
|
list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply1_fm.cl")
|
||||||
|
list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply2_fm.cl")
|
||||||
|
else()
|
||||||
|
list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply1.cl")
|
||||||
|
list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply2.cl")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
foreach(CL ${CL_LIST})
|
||||||
|
get_filename_component(FNAME ${CL} NAME_WE)
|
||||||
|
|
||||||
|
file(APPEND ${CL_SRC_FILE} "const std::string OpenclKernels::${FNAME}_str = R\"\( \n")
|
||||||
|
file(READ "${CL}" CL_CONTENT)
|
||||||
|
file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}")
|
||||||
|
file(APPEND ${CL_SRC_FILE} "\)\"; \n\n")
|
||||||
|
|
||||||
|
if(DEBUG_OPENCL_KERNELS_INTEL)
|
||||||
|
execute_process(
|
||||||
|
COMMAND ocloc -file ${CL} -device kbl -out_dir ${DEBUG_OPENCL_DIR}
|
||||||
|
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
endforeach()
|
||||||
|
|
||||||
|
file(APPEND ${CL_SRC_FILE} "\}\n")
|
||||||
|
file(APPEND ${CL_SRC_FILE} "\}")
|
||||||
|
|
||||||
|
if(DEBUG_OPENCL_KERNELS_INTEL)
|
||||||
|
file(REMOVE_RECURSE ${DEBUG_DIR})
|
||||||
|
endif()
|
68
opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl
Normal file
68
opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
/// ILU apply part 1: forward substitution.
|
||||||
|
/// Solves L*x=y where L is a lower triangular sparse blocked matrix.
|
||||||
|
/// Here, L is it's own BSR matrix.
|
||||||
|
__kernel void ILU_apply1(
|
||||||
|
__global const double *LUvals,
|
||||||
|
__global const unsigned int *LUcols,
|
||||||
|
__global const unsigned int *LUrows,
|
||||||
|
__global const int *diagIndex,
|
||||||
|
__global const double *y,
|
||||||
|
__global double *x,
|
||||||
|
__global const unsigned int *nodesPerColorPrefix,
|
||||||
|
const unsigned int color,
|
||||||
|
const unsigned int block_size,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
const unsigned int warpsize = 32;
|
||||||
|
const unsigned int bs = block_size;
|
||||||
|
const unsigned int idx_t = get_local_id(0);
|
||||||
|
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||||
|
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||||
|
const unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||||
|
unsigned int idx = get_global_id(0);
|
||||||
|
unsigned int target_block_row = idx / warpsize;
|
||||||
|
const unsigned int lane = idx_t % warpsize;
|
||||||
|
const unsigned int c = (lane / bs) % bs;
|
||||||
|
const unsigned int r = lane % bs;
|
||||||
|
|
||||||
|
target_block_row += nodesPerColorPrefix[color];
|
||||||
|
|
||||||
|
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||||
|
const unsigned int first_block = LUrows[target_block_row];
|
||||||
|
const unsigned int last_block = LUrows[target_block_row+1];
|
||||||
|
unsigned int block = first_block + lane / (bs*bs);
|
||||||
|
double local_out = 0.0;
|
||||||
|
|
||||||
|
if(lane < num_active_threads){
|
||||||
|
if(lane < bs){
|
||||||
|
local_out = y[target_block_row*bs+lane];
|
||||||
|
}
|
||||||
|
for(; block < last_block; block += num_blocks_per_warp){
|
||||||
|
const double x_elem = x[LUcols[block]*bs + c];
|
||||||
|
const double A_elem = LUvals[block*bs*bs + c + r*bs];
|
||||||
|
local_out -= x_elem * A_elem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// do reduction in shared mem
|
||||||
|
tmp[lane] = local_out;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
||||||
|
{
|
||||||
|
if (lane + offset < warpsize)
|
||||||
|
{
|
||||||
|
tmp[lane] += tmp[lane + offset];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(lane < bs){
|
||||||
|
const unsigned int row = target_block_row*bs + lane;
|
||||||
|
x[row] = tmp[lane];
|
||||||
|
}
|
||||||
|
|
||||||
|
target_block_row += num_warps_in_grid;
|
||||||
|
}
|
||||||
|
}
|
68
opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl
Normal file
68
opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
/// ILU apply part 1: forward substitution.
|
||||||
|
/// Solves L*x=y where L is a lower triangular sparse blocked matrix.
|
||||||
|
/// Here, L is inside a normal, square matrix.
|
||||||
|
/// In this case, diagIndex indicates where the rows of L end.
|
||||||
|
__kernel void ILU_apply1(
|
||||||
|
__global const double *LUvals,
|
||||||
|
__global const unsigned int *LUcols,
|
||||||
|
__global const unsigned int *LUrows,
|
||||||
|
__global const int *diagIndex,
|
||||||
|
__global const double *y,
|
||||||
|
__global double *x,
|
||||||
|
__global const unsigned int *nodesPerColorPrefix,
|
||||||
|
const unsigned int color,
|
||||||
|
const unsigned int block_size,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
const unsigned int warpsize = 32;
|
||||||
|
const unsigned int bs = block_size;
|
||||||
|
const unsigned int idx_t = get_local_id(0);
|
||||||
|
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||||
|
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||||
|
const unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||||
|
unsigned int idx = get_global_id(0);
|
||||||
|
unsigned int target_block_row = idx / warpsize;
|
||||||
|
target_block_row += nodesPerColorPrefix[color];
|
||||||
|
const unsigned int lane = idx_t % warpsize;
|
||||||
|
const unsigned int c = (lane / bs) % bs;
|
||||||
|
const unsigned int r = lane % bs;
|
||||||
|
|
||||||
|
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||||
|
const unsigned int first_block = LUrows[target_block_row];
|
||||||
|
const unsigned int last_block = diagIndex[target_block_row];
|
||||||
|
unsigned int block = first_block + lane / (bs*bs);
|
||||||
|
double local_out = 0.0;
|
||||||
|
|
||||||
|
if(lane < num_active_threads){
|
||||||
|
if(lane < bs){
|
||||||
|
local_out = y[target_block_row*bs+lane];
|
||||||
|
}
|
||||||
|
for(; block < last_block; block += num_blocks_per_warp){
|
||||||
|
const double x_elem = x[LUcols[block]*bs + c];
|
||||||
|
const double A_elem = LUvals[block*bs*bs + c + r*bs];
|
||||||
|
local_out -= x_elem * A_elem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// do reduction in shared mem
|
||||||
|
tmp[lane] = local_out;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
||||||
|
{
|
||||||
|
if (lane + offset < warpsize)
|
||||||
|
{
|
||||||
|
tmp[lane] += tmp[lane + offset];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(lane < bs){
|
||||||
|
const unsigned int row = target_block_row*bs + lane;
|
||||||
|
x[row] = tmp[lane];
|
||||||
|
}
|
||||||
|
|
||||||
|
target_block_row += num_warps_in_grid;
|
||||||
|
}
|
||||||
|
}
|
75
opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl
Normal file
75
opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
/// ILU apply part 2: backward substitution.
|
||||||
|
/// Solves U*x=y where U is an upper triangular sparse blocked matrix.
|
||||||
|
/// Here, U is it's own BSR matrix.
|
||||||
|
__kernel void ILU_apply2(
|
||||||
|
__global const double *LUvals,
|
||||||
|
__global const int *LUcols,
|
||||||
|
__global const int *LUrows,
|
||||||
|
__global const int *diagIndex,
|
||||||
|
__global const double *invDiagVals,
|
||||||
|
__global double *x,
|
||||||
|
__global const unsigned int *nodesPerColorPrefix,
|
||||||
|
const unsigned int color,
|
||||||
|
const unsigned int block_size,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
const unsigned int warpsize = 32;
|
||||||
|
const unsigned int bs = block_size;
|
||||||
|
const unsigned int idx_t = get_local_id(0);
|
||||||
|
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||||
|
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||||
|
const unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||||
|
unsigned int idx_g = get_global_id(0);
|
||||||
|
unsigned int target_block_row = idx_g / warpsize;
|
||||||
|
target_block_row += nodesPerColorPrefix[color];
|
||||||
|
const unsigned int lane = idx_t % warpsize;
|
||||||
|
const unsigned int c = (lane / bs) % bs;
|
||||||
|
const unsigned int r = lane % bs;
|
||||||
|
|
||||||
|
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||||
|
const unsigned int first_block = LUrows[target_block_row];
|
||||||
|
const unsigned int last_block = LUrows[target_block_row+1];
|
||||||
|
unsigned int block = first_block + lane / (bs*bs);
|
||||||
|
double local_out = 0.0;
|
||||||
|
|
||||||
|
if(lane < num_active_threads){
|
||||||
|
if(lane < bs){
|
||||||
|
const unsigned int row = target_block_row*bs+lane;
|
||||||
|
local_out = x[row];
|
||||||
|
}
|
||||||
|
for(; block < last_block; block += num_blocks_per_warp){
|
||||||
|
const double x_elem = x[LUcols[block]*bs + c];
|
||||||
|
const double A_elem = LUvals[block*bs*bs + c + r*bs];
|
||||||
|
local_out -= x_elem * A_elem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// do reduction in shared mem
|
||||||
|
tmp[lane] = local_out;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
||||||
|
{
|
||||||
|
if (lane + offset < warpsize)
|
||||||
|
{
|
||||||
|
tmp[lane] += tmp[lane + offset];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
local_out = tmp[lane];
|
||||||
|
|
||||||
|
if(lane < bs){
|
||||||
|
tmp[lane + bs*idx_t/warpsize] = local_out;
|
||||||
|
double sum = 0.0;
|
||||||
|
for(int i = 0; i < bs; ++i){
|
||||||
|
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
|
||||||
|
}
|
||||||
|
|
||||||
|
const unsigned int row = target_block_row*bs + lane;
|
||||||
|
x[row] = sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
target_block_row += num_warps_in_grid;
|
||||||
|
}
|
||||||
|
}
|
77
opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl
Normal file
77
opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
/// ILU apply part 2: backward substitution.
|
||||||
|
/// Solves U*x=y where U is an upper triangular sparse blocked matrix.
|
||||||
|
/// Here, U is inside a normal, square matrix.
|
||||||
|
/// In this case diagIndex indicates where the rows of U start.
|
||||||
|
__kernel void ILU_apply2(
|
||||||
|
__global const double *LUvals,
|
||||||
|
__global const int *LUcols,
|
||||||
|
__global const int *LUrows,
|
||||||
|
__global const int *diagIndex,
|
||||||
|
__global const double *invDiagVals,
|
||||||
|
__global double *x,
|
||||||
|
__global const unsigned int *nodesPerColorPrefix,
|
||||||
|
const unsigned int color,
|
||||||
|
const unsigned int block_size,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
const unsigned int warpsize = 32;
|
||||||
|
const unsigned int bs = block_size;
|
||||||
|
const unsigned int idx_t = get_local_id(0);
|
||||||
|
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||||
|
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||||
|
const unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||||
|
unsigned int idx_g = get_global_id(0);
|
||||||
|
unsigned int target_block_row = idx_g / warpsize;
|
||||||
|
const unsigned int lane = idx_t % warpsize;
|
||||||
|
const unsigned int c = (lane / bs) % bs;
|
||||||
|
const unsigned int r = lane % bs;
|
||||||
|
|
||||||
|
target_block_row += nodesPerColorPrefix[color];
|
||||||
|
|
||||||
|
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||||
|
const unsigned int first_block = diagIndex[target_block_row] + 1;
|
||||||
|
const unsigned int last_block = LUrows[target_block_row+1];
|
||||||
|
unsigned int block = first_block + lane / (bs*bs);
|
||||||
|
double local_out = 0.0;
|
||||||
|
|
||||||
|
if(lane < num_active_threads){
|
||||||
|
if(lane < bs){
|
||||||
|
const unsigned int row = target_block_row*bs+lane;
|
||||||
|
local_out = x[row];
|
||||||
|
}
|
||||||
|
for(; block < last_block; block += num_blocks_per_warp){
|
||||||
|
const double x_elem = x[LUcols[block]*bs + c];
|
||||||
|
const double A_elem = LUvals[block*bs*bs + c + r*bs];
|
||||||
|
local_out -= x_elem * A_elem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// do reduction in shared mem
|
||||||
|
tmp[lane] = local_out;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
||||||
|
{
|
||||||
|
if (lane + offset < warpsize)
|
||||||
|
{
|
||||||
|
tmp[lane] += tmp[lane + offset];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
local_out = tmp[lane];
|
||||||
|
|
||||||
|
if(lane < bs){
|
||||||
|
tmp[lane + bs*idx_t/warpsize] = local_out;
|
||||||
|
double sum = 0.0;
|
||||||
|
for(int i = 0; i < bs; ++i){
|
||||||
|
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
|
||||||
|
}
|
||||||
|
|
||||||
|
const unsigned int row = target_block_row*bs + lane;
|
||||||
|
x[row] = sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
target_block_row += num_warps_in_grid;
|
||||||
|
}
|
||||||
|
}
|
137
opm/simulators/linalg/bda/opencl/kernels/ILU_decomp.cl
Normal file
137
opm/simulators/linalg/bda/opencl/kernels/ILU_decomp.cl
Normal file
@ -0,0 +1,137 @@
|
|||||||
|
// a = a - (b * c)
|
||||||
|
__kernel void block_mult_sub(__global double *a, __local double *b, __global double *c)
|
||||||
|
{
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
const unsigned int hwarp_size = 16;
|
||||||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||||
|
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
|
||||||
|
if(thread_id_in_hwarp < block_size * block_size){
|
||||||
|
const unsigned int row = thread_id_in_hwarp / block_size;
|
||||||
|
const unsigned int col = thread_id_in_hwarp % block_size;
|
||||||
|
double temp = 0.0;
|
||||||
|
for (unsigned int k = 0; k < block_size; k++) {
|
||||||
|
temp += b[block_size * row + k] * c[block_size * k + col];
|
||||||
|
}
|
||||||
|
a[block_size * row + col] -= temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// c = a * b
|
||||||
|
__kernel void block_mult(__global double *a, __global double *b, __local double *c)
|
||||||
|
{
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
const unsigned int hwarp_size = 16;
|
||||||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||||
|
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
|
||||||
|
if(thread_id_in_hwarp < block_size * block_size){
|
||||||
|
const unsigned int row = thread_id_in_hwarp / block_size;
|
||||||
|
const unsigned int col = thread_id_in_hwarp % block_size;
|
||||||
|
double temp = 0.0;
|
||||||
|
for (unsigned int k = 0; k < block_size; k++) {
|
||||||
|
temp += a[block_size * row + k] * b[block_size * k + col];
|
||||||
|
}
|
||||||
|
c[block_size * row + col] = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// invert 3x3 matrix
|
||||||
|
__kernel void inverter(__global double *matrix, __global double *inverse)
|
||||||
|
{
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
const unsigned int bs = block_size; // rename to shorter name
|
||||||
|
const unsigned int hwarp_size = 16;
|
||||||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||||||
|
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
|
||||||
|
if(thread_id_in_hwarp < bs * bs){
|
||||||
|
double t4 = matrix[0] * matrix[4];
|
||||||
|
double t6 = matrix[0] * matrix[5];
|
||||||
|
double t8 = matrix[1] * matrix[3];
|
||||||
|
double t10 = matrix[2] * matrix[3];
|
||||||
|
double t12 = matrix[1] * matrix[6];
|
||||||
|
double t14 = matrix[2] * matrix[6];
|
||||||
|
|
||||||
|
double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
|
||||||
|
t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
|
||||||
|
double t17 = 1.0 / det;
|
||||||
|
|
||||||
|
const unsigned int r = thread_id_in_hwarp / bs;
|
||||||
|
const unsigned int c = thread_id_in_hwarp % bs;
|
||||||
|
const unsigned int r1 = (r+1) % bs;
|
||||||
|
const unsigned int c1 = (c+1) % bs;
|
||||||
|
const unsigned int r2 = (r+bs-1) % bs;
|
||||||
|
const unsigned int c2 = (c+bs-1) % bs;
|
||||||
|
inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Exact ilu decomposition kernel
|
||||||
|
/// The kernel takes a full BSR matrix and performs inplace ILU decomposition
|
||||||
|
__kernel void ilu_decomp(const unsigned int firstRow,
|
||||||
|
const unsigned int lastRow,
|
||||||
|
__global double *LUvals,
|
||||||
|
__global const int *LUcols,
|
||||||
|
__global const int *LUrows,
|
||||||
|
__global double *invDiagVals,
|
||||||
|
__global int *diagIndex,
|
||||||
|
const unsigned int Nb,
|
||||||
|
__local double *pivot)
|
||||||
|
{
|
||||||
|
const unsigned int bs = 3;
|
||||||
|
const unsigned int hwarp_size = 16;
|
||||||
|
const unsigned int work_group_size = get_local_size(0);
|
||||||
|
const unsigned int work_group_id = get_group_id(0);
|
||||||
|
const unsigned int num_groups = get_num_groups(0);
|
||||||
|
const unsigned int hwarps_per_group = work_group_size / hwarp_size;
|
||||||
|
const unsigned int thread_id_in_group = get_local_id(0); // thread id in work group
|
||||||
|
const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size; // thread id in hwarp (16 threads)
|
||||||
|
const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size;
|
||||||
|
const unsigned int lmem_offset = hwarp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it
|
||||||
|
// every workitem in a hwarp has the same lmem_offset
|
||||||
|
|
||||||
|
// go through all rows
|
||||||
|
for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
|
||||||
|
{
|
||||||
|
int iRowStart = LUrows[i];
|
||||||
|
int iRowEnd = LUrows[i + 1];
|
||||||
|
|
||||||
|
// go through all elements of the row
|
||||||
|
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
||||||
|
int j = LUcols[ij];
|
||||||
|
|
||||||
|
if (j < i) {
|
||||||
|
// calculate the pivot of this row
|
||||||
|
block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
|
||||||
|
|
||||||
|
// copy pivot
|
||||||
|
if (thread_id_in_hwarp < bs * bs) {
|
||||||
|
LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp];
|
||||||
|
}
|
||||||
|
|
||||||
|
int jRowEnd = LUrows[j + 1];
|
||||||
|
int jk = diagIndex[j] + 1;
|
||||||
|
int ik = ij + 1;
|
||||||
|
// subtract that row scaled by the pivot from this row.
|
||||||
|
while (ik < iRowEnd && jk < jRowEnd) {
|
||||||
|
if (LUcols[ik] == LUcols[jk]) {
|
||||||
|
block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs);
|
||||||
|
ik++;
|
||||||
|
jk++;
|
||||||
|
} else {
|
||||||
|
if (LUcols[ik] < LUcols[jk])
|
||||||
|
{ ik++; }
|
||||||
|
else
|
||||||
|
{ jk++; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// store the inverse in the diagonal
|
||||||
|
inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
|
||||||
|
|
||||||
|
// copy inverse
|
||||||
|
if (thread_id_in_hwarp < bs * bs) {
|
||||||
|
LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,17 @@
|
|||||||
|
/// add the coarse pressure solution back to the finer, complete solution
|
||||||
|
/// every workitem handles one blockrow
|
||||||
|
__kernel void add_coarse_pressure_correction(
|
||||||
|
__global const double *coarse_x,
|
||||||
|
__global double *fine_x,
|
||||||
|
const unsigned int pressure_idx,
|
||||||
|
const unsigned int Nb)
|
||||||
|
{
|
||||||
|
const unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
unsigned int target_block_row = get_global_id(0);
|
||||||
|
|
||||||
|
while(target_block_row < Nb){
|
||||||
|
fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
|
||||||
|
target_block_row += NUM_THREADS;
|
||||||
|
}
|
||||||
|
}
|
15
opm/simulators/linalg/bda/opencl/kernels/axpy.cl
Normal file
15
opm/simulators/linalg/bda/opencl/kernels/axpy.cl
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
/// axpy kernel: a = a + alpha * b
|
||||||
|
__kernel void axpy(
|
||||||
|
__global double *in,
|
||||||
|
const double a,
|
||||||
|
__global double *out,
|
||||||
|
const int N)
|
||||||
|
{
|
||||||
|
unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
int idx = get_global_id(0);
|
||||||
|
|
||||||
|
while(idx < N){
|
||||||
|
out[idx] += a * in[idx];
|
||||||
|
idx += NUM_THREADS;
|
||||||
|
}
|
||||||
|
}
|
22
opm/simulators/linalg/bda/opencl/kernels/custom.cl
Normal file
22
opm/simulators/linalg/bda/opencl/kernels/custom.cl
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
/// Custom kernel: combines some bicgstab vector operations into 1
|
||||||
|
/// p = (p - omega * v) * beta + r
|
||||||
|
__kernel void custom(
|
||||||
|
__global double *p,
|
||||||
|
__global double *v,
|
||||||
|
__global double *r,
|
||||||
|
const double omega,
|
||||||
|
const double beta,
|
||||||
|
const int N)
|
||||||
|
{
|
||||||
|
const unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
unsigned int idx = get_global_id(0);
|
||||||
|
|
||||||
|
while(idx < N){
|
||||||
|
double res = p[idx];
|
||||||
|
res -= omega * v[idx];
|
||||||
|
res *= beta;
|
||||||
|
res += r[idx];
|
||||||
|
p[idx] = res;
|
||||||
|
idx += NUM_THREADS;
|
||||||
|
}
|
||||||
|
}
|
37
opm/simulators/linalg/bda/opencl/kernels/dot_1.cl
Normal file
37
opm/simulators/linalg/bda/opencl/kernels/dot_1.cl
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
/// returns partial sums, instead of the final dot product
|
||||||
|
/// partial sums are added on CPU
|
||||||
|
__kernel void dot_1(
|
||||||
|
__global double *in1,
|
||||||
|
__global double *in2,
|
||||||
|
__global double *out,
|
||||||
|
const unsigned int N,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
unsigned int tid = get_local_id(0);
|
||||||
|
unsigned int bsize = get_local_size(0);
|
||||||
|
unsigned int bid = get_global_id(0) / bsize;
|
||||||
|
unsigned int i = get_global_id(0);
|
||||||
|
unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
|
||||||
|
double sum = 0.0;
|
||||||
|
while(i < N){
|
||||||
|
sum += in1[i] * in2[i];
|
||||||
|
i += NUM_THREADS;
|
||||||
|
}
|
||||||
|
tmp[tid] = sum;
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
// do reduction in shared mem
|
||||||
|
for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
|
||||||
|
{
|
||||||
|
if (tid < s)
|
||||||
|
{
|
||||||
|
tmp[tid] += tmp[tid + s];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// write result for this block to global mem
|
||||||
|
if (tid == 0) out[get_group_id(0)] = tmp[0];
|
||||||
|
}
|
@ -0,0 +1,22 @@
|
|||||||
|
/// transform blocked vector to scalar vector using pressure-weights
|
||||||
|
/// every workitem handles one blockrow
|
||||||
|
__kernel void full_to_pressure_restriction(
|
||||||
|
__global const double *fine_y,
|
||||||
|
__global const double *weights,
|
||||||
|
__global double *coarse_y,
|
||||||
|
const unsigned int Nb)
|
||||||
|
{
|
||||||
|
const unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
unsigned int target_block_row = get_global_id(0);
|
||||||
|
|
||||||
|
while(target_block_row < Nb){
|
||||||
|
double sum = 0.0;
|
||||||
|
unsigned int idx = block_size * target_block_row;
|
||||||
|
for (unsigned int i = 0; i < block_size; ++i) {
|
||||||
|
sum += fine_y[idx + i] * weights[idx + i];
|
||||||
|
}
|
||||||
|
coarse_y[target_block_row] = sum;
|
||||||
|
target_block_row += NUM_THREADS;
|
||||||
|
}
|
||||||
|
}
|
36
opm/simulators/linalg/bda/opencl/kernels/norm.cl
Normal file
36
opm/simulators/linalg/bda/opencl/kernels/norm.cl
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
/// returns partial sums, instead of the final norm
|
||||||
|
/// the square root must be computed on CPU
|
||||||
|
__kernel void norm(
|
||||||
|
__global double *in,
|
||||||
|
__global double *out,
|
||||||
|
const unsigned int N,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
unsigned int tid = get_local_id(0);
|
||||||
|
unsigned int bsize = get_local_size(0);
|
||||||
|
unsigned int bid = get_global_id(0) / bsize;
|
||||||
|
unsigned int i = get_global_id(0);
|
||||||
|
unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
|
||||||
|
double local_sum = 0.0;
|
||||||
|
while(i < N){
|
||||||
|
local_sum += in[i] * in[i];
|
||||||
|
i += NUM_THREADS;
|
||||||
|
}
|
||||||
|
tmp[tid] = local_sum;
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
// do reduction in shared mem
|
||||||
|
for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
|
||||||
|
{
|
||||||
|
if (tid < s)
|
||||||
|
{
|
||||||
|
tmp[tid] += tmp[tid + s];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// write result for this block to global mem
|
||||||
|
if (tid == 0) out[get_group_id(0)] = tmp[0];
|
||||||
|
}
|
@ -0,0 +1,16 @@
|
|||||||
|
/// prolongate vector during amg cycle
|
||||||
|
/// every workitem handles one row
|
||||||
|
__kernel void prolongate_vector(
|
||||||
|
__global const double *in,
|
||||||
|
__global double *out,
|
||||||
|
__global const int *cols,
|
||||||
|
const unsigned int N)
|
||||||
|
{
|
||||||
|
const unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
unsigned int row = get_global_id(0);
|
||||||
|
|
||||||
|
while(row < N){
|
||||||
|
out[row] += in[cols[row]];
|
||||||
|
row += NUM_THREADS;
|
||||||
|
}
|
||||||
|
}
|
50
opm/simulators/linalg/bda/opencl/kernels/residual.cl
Normal file
50
opm/simulators/linalg/bda/opencl/kernels/residual.cl
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
/// res = rhs - mat * x
|
||||||
|
/// algorithm based on:
|
||||||
|
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||||
|
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||||
|
__kernel void residual(
|
||||||
|
__global const double *vals,
|
||||||
|
__global const int *cols,
|
||||||
|
__global const int *rows,
|
||||||
|
const int N,
|
||||||
|
__global const double *x,
|
||||||
|
__global const double *rhs,
|
||||||
|
__global double *out,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
const unsigned int bsize = get_local_size(0);
|
||||||
|
const unsigned int idx_b = get_global_id(0) / bsize;
|
||||||
|
const unsigned int idx_t = get_local_id(0);
|
||||||
|
const unsigned int num_workgroups = get_num_groups(0);
|
||||||
|
|
||||||
|
int row = idx_b;
|
||||||
|
|
||||||
|
while (row < N) {
|
||||||
|
int rowStart = rows[row];
|
||||||
|
int rowEnd = rows[row+1];
|
||||||
|
int rowLength = rowEnd - rowStart;
|
||||||
|
double local_sum = 0.0;
|
||||||
|
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
|
||||||
|
int col = cols[j];
|
||||||
|
local_sum += vals[j] * x[col];
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp[idx_t] = local_sum;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
int offset = bsize / 2;
|
||||||
|
while(offset > 0) {
|
||||||
|
if (idx_t < offset) {
|
||||||
|
tmp[idx_t] += tmp[idx_t + offset];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
offset = offset / 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (idx_t == 0) {
|
||||||
|
out[row] = rhs[row] - tmp[idx_t];
|
||||||
|
}
|
||||||
|
|
||||||
|
row += num_workgroups;
|
||||||
|
}
|
||||||
|
}
|
68
opm/simulators/linalg/bda/opencl/kernels/residual_blocked.cl
Normal file
68
opm/simulators/linalg/bda/opencl/kernels/residual_blocked.cl
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
/// res = rhs - mat * x
|
||||||
|
/// algorithm based on:
|
||||||
|
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||||
|
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||||
|
__kernel void residual_blocked(
|
||||||
|
__global const double *vals,
|
||||||
|
__global const int *cols,
|
||||||
|
__global const int *rows,
|
||||||
|
const int Nb,
|
||||||
|
__global const double *x,
|
||||||
|
__global const double *rhs,
|
||||||
|
__global double *out,
|
||||||
|
const unsigned int block_size,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
const unsigned int warpsize = 32;
|
||||||
|
const unsigned int bsize = get_local_size(0);
|
||||||
|
const unsigned int idx_b = get_global_id(0) / bsize;
|
||||||
|
const unsigned int idx_t = get_local_id(0);
|
||||||
|
unsigned int idx = idx_b * bsize + idx_t;
|
||||||
|
const unsigned int bs = block_size;
|
||||||
|
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||||
|
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||||
|
const unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||||
|
unsigned int target_block_row = idx / warpsize;
|
||||||
|
const unsigned int lane = idx_t % warpsize;
|
||||||
|
const unsigned int c = (lane / bs) % bs;
|
||||||
|
const unsigned int r = lane % bs;
|
||||||
|
|
||||||
|
// for 3x3 blocks:
|
||||||
|
// num_active_threads: 27
|
||||||
|
// num_blocks_per_warp: 3
|
||||||
|
|
||||||
|
while(target_block_row < Nb){
|
||||||
|
unsigned int first_block = rows[target_block_row];
|
||||||
|
unsigned int last_block = rows[target_block_row+1];
|
||||||
|
unsigned int block = first_block + lane / (bs*bs);
|
||||||
|
double local_out = 0.0;
|
||||||
|
|
||||||
|
if(lane < num_active_threads){
|
||||||
|
for(; block < last_block; block += num_blocks_per_warp){
|
||||||
|
double x_elem = x[cols[block]*bs + c];
|
||||||
|
double A_elem = vals[block*bs*bs + c + r*bs];
|
||||||
|
local_out += x_elem * A_elem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// do reduction in shared mem
|
||||||
|
tmp[lane] = local_out;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
||||||
|
{
|
||||||
|
if (lane + offset < warpsize)
|
||||||
|
{
|
||||||
|
tmp[lane] += tmp[lane + offset];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(lane < bs){
|
||||||
|
unsigned int row = target_block_row*bs + lane;
|
||||||
|
out[row] = rhs[row] - tmp[lane];
|
||||||
|
}
|
||||||
|
target_block_row += num_warps_in_grid;
|
||||||
|
}
|
||||||
|
}
|
14
opm/simulators/linalg/bda/opencl/kernels/scale.cl
Normal file
14
opm/simulators/linalg/bda/opencl/kernels/scale.cl
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
/// scale vector with scalar: a = a * alpha
|
||||||
|
__kernel void scale(
|
||||||
|
__global double *vec,
|
||||||
|
const double a,
|
||||||
|
const int N)
|
||||||
|
{
|
||||||
|
unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
int idx = get_global_id(0);
|
||||||
|
|
||||||
|
while(idx < N){
|
||||||
|
vec[idx] *= a;
|
||||||
|
idx += NUM_THREADS;
|
||||||
|
}
|
||||||
|
}
|
49
opm/simulators/linalg/bda/opencl/kernels/spmv.cl
Normal file
49
opm/simulators/linalg/bda/opencl/kernels/spmv.cl
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
/// b = mat * x
|
||||||
|
/// algorithm based on:
|
||||||
|
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||||
|
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||||
|
__kernel void spmv(
|
||||||
|
__global const double *vals,
|
||||||
|
__global const int *cols,
|
||||||
|
__global const int *rows,
|
||||||
|
const int N,
|
||||||
|
__global const double *x,
|
||||||
|
__global double *out,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
const unsigned int bsize = get_local_size(0);
|
||||||
|
const unsigned int idx_b = get_global_id(0) / bsize;
|
||||||
|
const unsigned int idx_t = get_local_id(0);
|
||||||
|
const unsigned int num_workgroups = get_num_groups(0);
|
||||||
|
|
||||||
|
int row = idx_b;
|
||||||
|
|
||||||
|
while (row < N) {
|
||||||
|
int rowStart = rows[row];
|
||||||
|
int rowEnd = rows[row+1];
|
||||||
|
int rowLength = rowEnd - rowStart;
|
||||||
|
double local_sum = 0.0;
|
||||||
|
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
|
||||||
|
int col = cols[j];
|
||||||
|
local_sum += vals[j] * x[col];
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp[idx_t] = local_sum;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
int offset = bsize / 2;
|
||||||
|
while(offset > 0) {
|
||||||
|
if (idx_t < offset) {
|
||||||
|
tmp[idx_t] += tmp[idx_t + offset];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
offset = offset / 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (idx_t == 0) {
|
||||||
|
out[row] = tmp[idx_t];
|
||||||
|
}
|
||||||
|
|
||||||
|
row += num_workgroups;
|
||||||
|
}
|
||||||
|
}
|
67
opm/simulators/linalg/bda/opencl/kernels/spmv_blocked.cl
Normal file
67
opm/simulators/linalg/bda/opencl/kernels/spmv_blocked.cl
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
/// b = mat * x
|
||||||
|
/// algorithm based on:
|
||||||
|
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||||
|
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||||
|
__kernel void spmv_blocked(
|
||||||
|
__global const double *vals,
|
||||||
|
__global const int *cols,
|
||||||
|
__global const int *rows,
|
||||||
|
const int Nb,
|
||||||
|
__global const double *x,
|
||||||
|
__global double *out,
|
||||||
|
const unsigned int block_size,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
const unsigned int warpsize = 32;
|
||||||
|
const unsigned int bsize = get_local_size(0);
|
||||||
|
const unsigned int idx_b = get_global_id(0) / bsize;
|
||||||
|
const unsigned int idx_t = get_local_id(0);
|
||||||
|
unsigned int idx = idx_b * bsize + idx_t;
|
||||||
|
const unsigned int bs = block_size;
|
||||||
|
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||||
|
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||||
|
const unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||||
|
unsigned int target_block_row = idx / warpsize;
|
||||||
|
const unsigned int lane = idx_t % warpsize;
|
||||||
|
const unsigned int c = (lane / bs) % bs;
|
||||||
|
const unsigned int r = lane % bs;
|
||||||
|
|
||||||
|
// for 3x3 blocks:
|
||||||
|
// num_active_threads: 27
|
||||||
|
// num_blocks_per_warp: 3
|
||||||
|
|
||||||
|
while(target_block_row < Nb){
|
||||||
|
unsigned int first_block = rows[target_block_row];
|
||||||
|
unsigned int last_block = rows[target_block_row+1];
|
||||||
|
unsigned int block = first_block + lane / (bs*bs);
|
||||||
|
double local_out = 0.0;
|
||||||
|
|
||||||
|
if(lane < num_active_threads){
|
||||||
|
for(; block < last_block; block += num_blocks_per_warp){
|
||||||
|
double x_elem = x[cols[block]*bs + c];
|
||||||
|
double A_elem = vals[block*bs*bs + c + r*bs];
|
||||||
|
local_out += x_elem * A_elem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// do reduction in shared mem
|
||||||
|
tmp[lane] = local_out;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
||||||
|
{
|
||||||
|
if (lane + offset < warpsize)
|
||||||
|
{
|
||||||
|
tmp[lane] += tmp[lane + offset];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(lane < bs){
|
||||||
|
unsigned int row = target_block_row*bs + lane;
|
||||||
|
out[row] = tmp[lane];
|
||||||
|
}
|
||||||
|
target_block_row += num_warps_in_grid;
|
||||||
|
}
|
||||||
|
}
|
48
opm/simulators/linalg/bda/opencl/kernels/spmv_noreset.cl
Normal file
48
opm/simulators/linalg/bda/opencl/kernels/spmv_noreset.cl
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
/// algorithm based on:
|
||||||
|
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||||
|
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||||
|
__kernel void spmv_noreset(
|
||||||
|
__global const double *vals,
|
||||||
|
__global const int *cols,
|
||||||
|
__global const int *rows,
|
||||||
|
const int N,
|
||||||
|
__global const double *x,
|
||||||
|
__global double *out,
|
||||||
|
__local double *tmp)
|
||||||
|
{
|
||||||
|
const unsigned int bsize = get_local_size(0);
|
||||||
|
const unsigned int idx_b = get_global_id(0) / bsize;
|
||||||
|
const unsigned int idx_t = get_local_id(0);
|
||||||
|
const unsigned int num_workgroups = get_num_groups(0);
|
||||||
|
|
||||||
|
int row = idx_b;
|
||||||
|
|
||||||
|
while (row < N) {
|
||||||
|
int rowStart = rows[row];
|
||||||
|
int rowEnd = rows[row+1];
|
||||||
|
int rowLength = rowEnd - rowStart;
|
||||||
|
double local_sum = 0.0;
|
||||||
|
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
|
||||||
|
int col = cols[j];
|
||||||
|
local_sum += vals[j] * x[col];
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp[idx_t] = local_sum;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
int offset = bsize / 2;
|
||||||
|
while(offset > 0) {
|
||||||
|
if (idx_t < offset) {
|
||||||
|
tmp[idx_t] += tmp[idx_t + offset];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
offset = offset / 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (idx_t == 0) {
|
||||||
|
out[row] += tmp[idx_t];
|
||||||
|
}
|
||||||
|
|
||||||
|
row += num_workgroups;
|
||||||
|
}
|
||||||
|
}
|
76
opm/simulators/linalg/bda/opencl/kernels/stdwell_apply.cl
Normal file
76
opm/simulators/linalg/bda/opencl/kernels/stdwell_apply.cl
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
/// In this kernel there is reordering: the B/Ccols do not correspond with the x/y vector
|
||||||
|
/// the x/y vector is reordered, using toOrder to address that
|
||||||
|
__kernel void stdwell_apply(
|
||||||
|
__global const double *Cnnzs,
|
||||||
|
__global const double *Dnnzs,
|
||||||
|
__global const double *Bnnzs,
|
||||||
|
__global const int *Ccols,
|
||||||
|
__global const int *Bcols,
|
||||||
|
__global const double *x,
|
||||||
|
__global double *y,
|
||||||
|
__global const int *toOrder,
|
||||||
|
const unsigned int dim,
|
||||||
|
const unsigned int dim_wells,
|
||||||
|
__global const unsigned int *val_pointers,
|
||||||
|
__local double *localSum,
|
||||||
|
__local double *z1,
|
||||||
|
__local double *z2)
|
||||||
|
{
|
||||||
|
int wgId = get_group_id(0);
|
||||||
|
int wiId = get_local_id(0);
|
||||||
|
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
|
||||||
|
int valsPerBlock = dim*dim_wells;
|
||||||
|
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
|
||||||
|
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
|
||||||
|
int c = wiId % dim;
|
||||||
|
int r = (wiId/dim) % dim_wells;
|
||||||
|
double temp;
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
localSum[wiId] = 0;
|
||||||
|
if(wiId < numActiveWorkItems){
|
||||||
|
int b = wiId/valsPerBlock + val_pointers[wgId];
|
||||||
|
while(b < valSize + val_pointers[wgId]){
|
||||||
|
int colIdx = toOrder[Bcols[b]];
|
||||||
|
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
|
||||||
|
b += numBlocksPerWarp;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(wiId < valsPerBlock){
|
||||||
|
localSum[wiId] += localSum[wiId + valsPerBlock];
|
||||||
|
}
|
||||||
|
|
||||||
|
b = wiId/valsPerBlock + val_pointers[wgId];
|
||||||
|
|
||||||
|
if(c == 0 && wiId < valsPerBlock){
|
||||||
|
for(unsigned int stride = 2; stride > 0; stride >>= 1){
|
||||||
|
localSum[wiId] += localSum[wiId + stride];
|
||||||
|
}
|
||||||
|
z1[r] = localSum[wiId];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if(wiId < dim_wells){
|
||||||
|
temp = 0.0;
|
||||||
|
for(unsigned int i = 0; i < dim_wells; ++i){
|
||||||
|
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
|
||||||
|
}
|
||||||
|
z2[wiId] = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if(wiId < dim*valSize){
|
||||||
|
temp = 0.0;
|
||||||
|
int bb = wiId/dim + val_pointers[wgId];
|
||||||
|
for (unsigned int j = 0; j < dim_wells; ++j){
|
||||||
|
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
int colIdx = toOrder[Ccols[bb]];
|
||||||
|
y[colIdx*dim + c] -= temp;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,74 @@
|
|||||||
|
/// Applies sdtwells without reordering
|
||||||
|
__kernel void stdwell_apply_no_reorder(
|
||||||
|
__global const double *Cnnzs,
|
||||||
|
__global const double *Dnnzs,
|
||||||
|
__global const double *Bnnzs,
|
||||||
|
__global const int *Ccols,
|
||||||
|
__global const int *Bcols,
|
||||||
|
__global const double *x,
|
||||||
|
__global double *y,
|
||||||
|
const unsigned int dim,
|
||||||
|
const unsigned int dim_wells,
|
||||||
|
__global const unsigned int *val_pointers,
|
||||||
|
__local double *localSum,
|
||||||
|
__local double *z1,
|
||||||
|
__local double *z2)
|
||||||
|
{
|
||||||
|
int wgId = get_group_id(0);
|
||||||
|
int wiId = get_local_id(0);
|
||||||
|
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
|
||||||
|
int valsPerBlock = dim*dim_wells;
|
||||||
|
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
|
||||||
|
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
|
||||||
|
int c = wiId % dim;
|
||||||
|
int r = (wiId/dim) % dim_wells;
|
||||||
|
double temp;
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
localSum[wiId] = 0;
|
||||||
|
if(wiId < numActiveWorkItems){
|
||||||
|
int b = wiId/valsPerBlock + val_pointers[wgId];
|
||||||
|
while(b < valSize + val_pointers[wgId]){
|
||||||
|
int colIdx = Bcols[b];
|
||||||
|
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
|
||||||
|
b += numBlocksPerWarp;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(wiId < valsPerBlock){
|
||||||
|
localSum[wiId] += localSum[wiId + valsPerBlock];
|
||||||
|
}
|
||||||
|
|
||||||
|
b = wiId/valsPerBlock + val_pointers[wgId];
|
||||||
|
|
||||||
|
if(c == 0 && wiId < valsPerBlock){
|
||||||
|
for(unsigned int stride = 2; stride > 0; stride >>= 1){
|
||||||
|
localSum[wiId] += localSum[wiId + stride];
|
||||||
|
}
|
||||||
|
z1[r] = localSum[wiId];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if(wiId < dim_wells){
|
||||||
|
temp = 0.0;
|
||||||
|
for(unsigned int i = 0; i < dim_wells; ++i){
|
||||||
|
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
|
||||||
|
}
|
||||||
|
z2[wiId] = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if(wiId < dim*valSize){
|
||||||
|
temp = 0.0;
|
||||||
|
int bb = wiId/dim + val_pointers[wgId];
|
||||||
|
for (unsigned int j = 0; j < dim_wells; ++j){
|
||||||
|
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
int colIdx = Ccols[bb];
|
||||||
|
y[colIdx*dim + c] -= temp;
|
||||||
|
}
|
||||||
|
}
|
17
opm/simulators/linalg/bda/opencl/kernels/vmul.cl
Normal file
17
opm/simulators/linalg/bda/opencl/kernels/vmul.cl
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
/// multiply vector with another vector and a scalar, element-wise
|
||||||
|
/// add result to a third vector
|
||||||
|
__kernel void vmul(
|
||||||
|
const double alpha,
|
||||||
|
__global double const *in1,
|
||||||
|
__global double const *in2,
|
||||||
|
__global double *out,
|
||||||
|
const int N)
|
||||||
|
{
|
||||||
|
unsigned int NUM_THREADS = get_global_size(0);
|
||||||
|
int idx = get_global_id(0);
|
||||||
|
|
||||||
|
while(idx < N){
|
||||||
|
out[idx] += alpha * in1[idx] * in2[idx];
|
||||||
|
idx += NUM_THREADS;
|
||||||
|
}
|
||||||
|
}
|
@ -69,12 +69,8 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
|
|||||||
return A / B + (A % B > 0);
|
return A / B + (A % B > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_kernel_source(cl::Program::Sources &sources, const std::string &source) {
|
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_)
|
||||||
sources.emplace_back(source);
|
{
|
||||||
}
|
|
||||||
|
|
||||||
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_) {
|
|
||||||
|
|
||||||
if (initialized) {
|
if (initialized) {
|
||||||
OpmLog::debug("Warning OpenclKernels is already initialized");
|
OpmLog::debug("Warning OpenclKernels is already initialized");
|
||||||
return;
|
return;
|
||||||
@ -84,49 +80,30 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
|||||||
verbosity = verbosity_;
|
verbosity = verbosity_;
|
||||||
|
|
||||||
cl::Program::Sources sources;
|
cl::Program::Sources sources;
|
||||||
const std::string& axpy_s = get_axpy_source();
|
sources.emplace_back(axpy_str);
|
||||||
add_kernel_source(sources, axpy_s);
|
sources.emplace_back(scale_str);
|
||||||
const std::string& scale_s = get_scale_source();
|
sources.emplace_back(vmul_str);
|
||||||
add_kernel_source(sources, scale_s);
|
sources.emplace_back(dot_1_str);
|
||||||
const std::string& vmul_s = get_vmul_source();
|
sources.emplace_back(norm_str);
|
||||||
add_kernel_source(sources, vmul_s);
|
sources.emplace_back(custom_str);
|
||||||
const std::string& dot_1_s = get_dot_1_source();
|
sources.emplace_back(full_to_pressure_restriction_str);
|
||||||
add_kernel_source(sources, dot_1_s);
|
sources.emplace_back(add_coarse_pressure_correction_str);
|
||||||
const std::string& norm_s = get_norm_source();
|
sources.emplace_back(prolongate_vector_str);
|
||||||
add_kernel_source(sources, norm_s);
|
sources.emplace_back(spmv_blocked_str);
|
||||||
const std::string& custom_s = get_custom_source();
|
sources.emplace_back(spmv_str);
|
||||||
add_kernel_source(sources, custom_s);
|
sources.emplace_back(spmv_noreset_str);
|
||||||
const std::string& full_to_pressure_restriction_s = get_full_to_pressure_restriction_source();
|
sources.emplace_back(residual_blocked_str);
|
||||||
add_kernel_source(sources, full_to_pressure_restriction_s);
|
sources.emplace_back(residual_str);
|
||||||
const std::string& add_coarse_pressure_correction_s = get_add_coarse_pressure_correction_source();
|
|
||||||
add_kernel_source(sources, add_coarse_pressure_correction_s);
|
|
||||||
const std::string& prolongate_vector_s = get_prolongate_vector_source();
|
|
||||||
add_kernel_source(sources, prolongate_vector_s);
|
|
||||||
const std::string& spmv_blocked_s = get_blocked_matrix_operation_source(matrix_operation::spmv_op);
|
|
||||||
add_kernel_source(sources, spmv_blocked_s);
|
|
||||||
const std::string& spmv_s = get_matrix_operation_source(matrix_operation::spmv_op, true);
|
|
||||||
add_kernel_source(sources, spmv_s);
|
|
||||||
const std::string& spmv_noreset_s = get_matrix_operation_source(matrix_operation::spmv_op, false);
|
|
||||||
add_kernel_source(sources, spmv_noreset_s);
|
|
||||||
const std::string& residual_blocked_s = get_blocked_matrix_operation_source(matrix_operation::residual_op);
|
|
||||||
add_kernel_source(sources, residual_blocked_s);
|
|
||||||
const std::string& residual_s = get_matrix_operation_source(matrix_operation::residual_op);
|
|
||||||
add_kernel_source(sources, residual_s);
|
|
||||||
#if CHOW_PATEL
|
#if CHOW_PATEL
|
||||||
bool ilu_operate_on_full_matrix = false;
|
sources.emplace_back(ILU_apply1_str);
|
||||||
|
sources.emplace_back(ILU_apply2_str);
|
||||||
#else
|
#else
|
||||||
bool ilu_operate_on_full_matrix = true;
|
sources.emplace_back(ILU_apply1_fm_str);
|
||||||
|
sources.emplace_back(ILU_apply2_fm_str);
|
||||||
#endif
|
#endif
|
||||||
const std::string& ILU_apply1_s = get_ILU_apply1_source(ilu_operate_on_full_matrix);
|
sources.emplace_back(stdwell_apply_str);
|
||||||
add_kernel_source(sources, ILU_apply1_s);
|
sources.emplace_back(stdwell_apply_no_reorder_str);
|
||||||
const std::string& ILU_apply2_s = get_ILU_apply2_source(ilu_operate_on_full_matrix);
|
sources.emplace_back(ILU_decomp_str);
|
||||||
add_kernel_source(sources, ILU_apply2_s);
|
|
||||||
const std::string& stdwell_apply_s = get_stdwell_apply_source(true);
|
|
||||||
add_kernel_source(sources, stdwell_apply_s);
|
|
||||||
const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_source(false);
|
|
||||||
add_kernel_source(sources, stdwell_apply_no_reorder_s);
|
|
||||||
const std::string& ilu_decomp_s = get_ilu_decomp_source();
|
|
||||||
add_kernel_source(sources, ilu_decomp_s);
|
|
||||||
|
|
||||||
cl::Program program = cl::Program(*context, sources);
|
cl::Program program = cl::Program(*context, sources);
|
||||||
program.build(devices);
|
program.build(devices);
|
||||||
@ -158,9 +135,6 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
|||||||
initialized = true;
|
initialized = true;
|
||||||
} // end get_opencl_kernels()
|
} // end get_opencl_kernels()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||||
{
|
{
|
||||||
const unsigned int work_group_size = 256;
|
const unsigned int work_group_size = 256;
|
||||||
@ -387,7 +361,6 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
|
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
|
||||||
{
|
{
|
||||||
const unsigned int work_group_size = 32;
|
const unsigned int work_group_size = 32;
|
||||||
@ -406,7 +379,6 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& r
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
|
void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
|
||||||
{
|
{
|
||||||
const unsigned int work_group_size = 32;
|
const unsigned int work_group_size = 32;
|
||||||
@ -488,808 +460,5 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::string OpenclKernels::get_axpy_source() {
|
|
||||||
return R"(
|
|
||||||
__kernel void axpy(
|
|
||||||
__global double *in,
|
|
||||||
const double a,
|
|
||||||
__global double *out,
|
|
||||||
const int N)
|
|
||||||
{
|
|
||||||
unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
int idx = get_global_id(0);
|
|
||||||
|
|
||||||
while(idx < N){
|
|
||||||
out[idx] += a * in[idx];
|
|
||||||
idx += NUM_THREADS;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// scale vector with scalar
|
|
||||||
std::string OpenclKernels::get_scale_source() {
|
|
||||||
return R"(
|
|
||||||
__kernel void scale(
|
|
||||||
__global double *vec,
|
|
||||||
const double a,
|
|
||||||
const int N)
|
|
||||||
{
|
|
||||||
unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
int idx = get_global_id(0);
|
|
||||||
|
|
||||||
while(idx < N){
|
|
||||||
vec[idx] *= a;
|
|
||||||
idx += NUM_THREADS;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
|
|
||||||
// multiply vector with another vector and a scalar, element-wise
|
|
||||||
// add result to a third vector
|
|
||||||
std::string OpenclKernels::get_vmul_source() {
|
|
||||||
return R"(
|
|
||||||
__kernel void vmul(
|
|
||||||
const double alpha,
|
|
||||||
__global double const *in1,
|
|
||||||
__global double const *in2,
|
|
||||||
__global double *out,
|
|
||||||
const int N)
|
|
||||||
{
|
|
||||||
unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
int idx = get_global_id(0);
|
|
||||||
|
|
||||||
while(idx < N){
|
|
||||||
out[idx] += alpha * in1[idx] * in2[idx];
|
|
||||||
idx += NUM_THREADS;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// returns partial sums, instead of the final dot product
|
|
||||||
std::string OpenclKernels::get_dot_1_source() {
|
|
||||||
return R"(
|
|
||||||
__kernel void dot_1(
|
|
||||||
__global double *in1,
|
|
||||||
__global double *in2,
|
|
||||||
__global double *out,
|
|
||||||
const unsigned int N,
|
|
||||||
__local double *tmp)
|
|
||||||
{
|
|
||||||
unsigned int tid = get_local_id(0);
|
|
||||||
unsigned int bsize = get_local_size(0);
|
|
||||||
unsigned int bid = get_global_id(0) / bsize;
|
|
||||||
unsigned int i = get_global_id(0);
|
|
||||||
unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
|
|
||||||
double sum = 0.0;
|
|
||||||
while(i < N){
|
|
||||||
sum += in1[i] * in2[i];
|
|
||||||
i += NUM_THREADS;
|
|
||||||
}
|
|
||||||
tmp[tid] = sum;
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
// do reduction in shared mem
|
|
||||||
for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
|
|
||||||
{
|
|
||||||
if (tid < s)
|
|
||||||
{
|
|
||||||
tmp[tid] += tmp[tid + s];
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
|
|
||||||
// write result for this block to global mem
|
|
||||||
if (tid == 0) out[get_group_id(0)] = tmp[0];
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// returns partial sums, instead of the final norm
|
|
||||||
// the square root must be computed on CPU
|
|
||||||
std::string OpenclKernels::get_norm_source() {
|
|
||||||
return R"(
|
|
||||||
__kernel void norm(
|
|
||||||
__global double *in,
|
|
||||||
__global double *out,
|
|
||||||
const unsigned int N,
|
|
||||||
__local double *tmp)
|
|
||||||
{
|
|
||||||
unsigned int tid = get_local_id(0);
|
|
||||||
unsigned int bsize = get_local_size(0);
|
|
||||||
unsigned int bid = get_global_id(0) / bsize;
|
|
||||||
unsigned int i = get_global_id(0);
|
|
||||||
unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
|
|
||||||
double local_sum = 0.0;
|
|
||||||
while(i < N){
|
|
||||||
local_sum += in[i] * in[i];
|
|
||||||
i += NUM_THREADS;
|
|
||||||
}
|
|
||||||
tmp[tid] = local_sum;
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
// do reduction in shared mem
|
|
||||||
for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
|
|
||||||
{
|
|
||||||
if (tid < s)
|
|
||||||
{
|
|
||||||
tmp[tid] += tmp[tid + s];
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
|
|
||||||
// write result for this block to global mem
|
|
||||||
if (tid == 0) out[get_group_id(0)] = tmp[0];
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// p = (p - omega * v) * beta + r
|
|
||||||
std::string OpenclKernels::get_custom_source() {
|
|
||||||
return R"(
|
|
||||||
__kernel void custom(
|
|
||||||
__global double *p,
|
|
||||||
__global double *v,
|
|
||||||
__global double *r,
|
|
||||||
const double omega,
|
|
||||||
const double beta,
|
|
||||||
const int N)
|
|
||||||
{
|
|
||||||
const unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
unsigned int idx = get_global_id(0);
|
|
||||||
|
|
||||||
while(idx < N){
|
|
||||||
double res = p[idx];
|
|
||||||
res -= omega * v[idx];
|
|
||||||
res *= beta;
|
|
||||||
res += r[idx];
|
|
||||||
p[idx] = res;
|
|
||||||
idx += NUM_THREADS;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// transform blocked vector to scalar vector using pressure-weights
|
|
||||||
// every workitem handles one blockrow
|
|
||||||
std::string OpenclKernels::get_full_to_pressure_restriction_source() {
|
|
||||||
return R"(
|
|
||||||
__kernel void full_to_pressure_restriction(
|
|
||||||
__global const double *fine_y,
|
|
||||||
__global const double *weights,
|
|
||||||
__global double *coarse_y,
|
|
||||||
const unsigned int Nb)
|
|
||||||
{
|
|
||||||
const unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
const unsigned int block_size = 3;
|
|
||||||
unsigned int target_block_row = get_global_id(0);
|
|
||||||
|
|
||||||
while(target_block_row < Nb){
|
|
||||||
double sum = 0.0;
|
|
||||||
unsigned int idx = block_size * target_block_row;
|
|
||||||
for (unsigned int i = 0; i < block_size; ++i) {
|
|
||||||
sum += fine_y[idx + i] * weights[idx + i];
|
|
||||||
}
|
|
||||||
coarse_y[target_block_row] = sum;
|
|
||||||
target_block_row += NUM_THREADS;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
|
|
||||||
// add the coarse pressure solution back to the finer, complete solution
|
|
||||||
// every workitem handles one blockrow
|
|
||||||
std::string OpenclKernels::get_add_coarse_pressure_correction_source() {
|
|
||||||
return R"(
|
|
||||||
__kernel void add_coarse_pressure_correction(
|
|
||||||
__global const double *coarse_x,
|
|
||||||
__global double *fine_x,
|
|
||||||
const unsigned int pressure_idx,
|
|
||||||
const unsigned int Nb)
|
|
||||||
{
|
|
||||||
const unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
const unsigned int block_size = 3;
|
|
||||||
unsigned int target_block_row = get_global_id(0);
|
|
||||||
|
|
||||||
while(target_block_row < Nb){
|
|
||||||
fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
|
|
||||||
target_block_row += NUM_THREADS;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
|
|
||||||
// prolongate vector during amg cycle
|
|
||||||
// every workitem handles one row
|
|
||||||
std::string OpenclKernels::get_prolongate_vector_source() {
|
|
||||||
return R"(
|
|
||||||
__kernel void prolongate_vector(
|
|
||||||
__global const double *in,
|
|
||||||
__global double *out,
|
|
||||||
__global const int *cols,
|
|
||||||
const unsigned int N)
|
|
||||||
{
|
|
||||||
const unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
unsigned int row = get_global_id(0);
|
|
||||||
|
|
||||||
while(row < N){
|
|
||||||
out[row] += in[cols[row]];
|
|
||||||
row += NUM_THREADS;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// either b = mat * x
|
|
||||||
/// or res = rhs - mat * x
|
|
||||||
std::string OpenclKernels::get_blocked_matrix_operation_source(matrix_operation op) {
|
|
||||||
std::string s;
|
|
||||||
if (op == matrix_operation::spmv_op) {
|
|
||||||
s += "__kernel void spmv_blocked(";
|
|
||||||
} else {
|
|
||||||
s += "__kernel void residual_blocked(";
|
|
||||||
}
|
|
||||||
s += R"(__global const double *vals,
|
|
||||||
__global const int *cols,
|
|
||||||
__global const int *rows,
|
|
||||||
const int Nb,
|
|
||||||
__global const double *x,
|
|
||||||
)";
|
|
||||||
if (op == matrix_operation::residual_op) {
|
|
||||||
s += "__global const double *rhs,";
|
|
||||||
}
|
|
||||||
s += R"(
|
|
||||||
__global double *out,
|
|
||||||
const unsigned int block_size,
|
|
||||||
__local double *tmp)
|
|
||||||
{
|
|
||||||
const unsigned int warpsize = 32;
|
|
||||||
const unsigned int bsize = get_local_size(0);
|
|
||||||
const unsigned int idx_b = get_global_id(0) / bsize;
|
|
||||||
const unsigned int idx_t = get_local_id(0);
|
|
||||||
unsigned int idx = idx_b * bsize + idx_t;
|
|
||||||
const unsigned int bs = block_size;
|
|
||||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
|
||||||
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
|
||||||
const unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
|
||||||
unsigned int target_block_row = idx / warpsize;
|
|
||||||
const unsigned int lane = idx_t % warpsize;
|
|
||||||
const unsigned int c = (lane / bs) % bs;
|
|
||||||
const unsigned int r = lane % bs;
|
|
||||||
|
|
||||||
// for 3x3 blocks:
|
|
||||||
// num_active_threads: 27
|
|
||||||
// num_blocks_per_warp: 3
|
|
||||||
|
|
||||||
while(target_block_row < Nb){
|
|
||||||
unsigned int first_block = rows[target_block_row];
|
|
||||||
unsigned int last_block = rows[target_block_row+1];
|
|
||||||
unsigned int block = first_block + lane / (bs*bs);
|
|
||||||
double local_out = 0.0;
|
|
||||||
|
|
||||||
if(lane < num_active_threads){
|
|
||||||
for(; block < last_block; block += num_blocks_per_warp){
|
|
||||||
double x_elem = x[cols[block]*bs + c];
|
|
||||||
double A_elem = vals[block*bs*bs + c + r*bs];
|
|
||||||
local_out += x_elem * A_elem;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// do reduction in shared mem
|
|
||||||
tmp[lane] = local_out;
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
|
||||||
{
|
|
||||||
if (lane + offset < warpsize)
|
|
||||||
{
|
|
||||||
tmp[lane] += tmp[lane + offset];
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(lane < bs){
|
|
||||||
unsigned int row = target_block_row*bs + lane;
|
|
||||||
)";
|
|
||||||
if (op == matrix_operation::spmv_op) {
|
|
||||||
s += " out[row] = tmp[lane];";
|
|
||||||
} else {
|
|
||||||
s += " out[row] = rhs[row] - tmp[lane];";
|
|
||||||
}
|
|
||||||
s += R"(
|
|
||||||
}
|
|
||||||
target_block_row += num_warps_in_grid;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// either b = mat * x
|
|
||||||
/// or res = rhs - mat * x
|
|
||||||
std::string OpenclKernels::get_matrix_operation_source(matrix_operation op, bool spmv_reset) {
|
|
||||||
std::string s;
|
|
||||||
if (op == matrix_operation::spmv_op) {
|
|
||||||
if (spmv_reset) {
|
|
||||||
s += "__kernel void spmv(";
|
|
||||||
} else {
|
|
||||||
s += "__kernel void spmv_noreset(";
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
s += "__kernel void residual(";
|
|
||||||
}
|
|
||||||
s += R"(__global const double *vals,
|
|
||||||
__global const int *cols,
|
|
||||||
__global const int *rows,
|
|
||||||
const int N,
|
|
||||||
__global const double *x,
|
|
||||||
)";
|
|
||||||
if (op == matrix_operation::residual_op) {
|
|
||||||
s += "__global const double *rhs,";
|
|
||||||
}
|
|
||||||
s += R"(
|
|
||||||
__global double *out,
|
|
||||||
__local double *tmp)
|
|
||||||
{
|
|
||||||
const unsigned int bsize = get_local_size(0);
|
|
||||||
const unsigned int idx_b = get_global_id(0) / bsize;
|
|
||||||
const unsigned int idx_t = get_local_id(0);
|
|
||||||
const unsigned int num_workgroups = get_num_groups(0);
|
|
||||||
|
|
||||||
int row = idx_b;
|
|
||||||
|
|
||||||
while (row < N) {
|
|
||||||
int rowStart = rows[row];
|
|
||||||
int rowEnd = rows[row+1];
|
|
||||||
int rowLength = rowEnd - rowStart;
|
|
||||||
double local_sum = 0.0;
|
|
||||||
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
|
|
||||||
int col = cols[j];
|
|
||||||
local_sum += vals[j] * x[col];
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp[idx_t] = local_sum;
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
int offset = bsize / 2;
|
|
||||||
while(offset > 0) {
|
|
||||||
if (idx_t < offset) {
|
|
||||||
tmp[idx_t] += tmp[idx_t + offset];
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
offset = offset / 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (idx_t == 0) {
|
|
||||||
)";
|
|
||||||
if (op == matrix_operation::spmv_op) {
|
|
||||||
if (spmv_reset) {
|
|
||||||
s += " out[row] = tmp[idx_t];";
|
|
||||||
} else {
|
|
||||||
s += " out[row] += tmp[idx_t];";
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
s += " out[row] = rhs[row] - tmp[idx_t];";
|
|
||||||
}
|
|
||||||
s += R"(
|
|
||||||
}
|
|
||||||
row += num_workgroups;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
std::string OpenclKernels::get_ILU_apply1_source(bool full_matrix) {
|
|
||||||
std::string s = R"(
|
|
||||||
__kernel void ILU_apply1(
|
|
||||||
__global const double *LUvals,
|
|
||||||
__global const unsigned int *LUcols,
|
|
||||||
__global const unsigned int *LUrows,
|
|
||||||
__global const int *diagIndex,
|
|
||||||
__global const double *y,
|
|
||||||
__global double *x,
|
|
||||||
__global const unsigned int *nodesPerColorPrefix,
|
|
||||||
const unsigned int color,
|
|
||||||
const unsigned int block_size,
|
|
||||||
__local double *tmp)
|
|
||||||
{
|
|
||||||
const unsigned int warpsize = 32;
|
|
||||||
const unsigned int bs = block_size;
|
|
||||||
const unsigned int idx_t = get_local_id(0);
|
|
||||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
|
||||||
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
|
||||||
const unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
|
||||||
unsigned int idx = get_global_id(0);
|
|
||||||
unsigned int target_block_row = idx / warpsize;
|
|
||||||
target_block_row += nodesPerColorPrefix[color];
|
|
||||||
const unsigned int lane = idx_t % warpsize;
|
|
||||||
const unsigned int c = (lane / bs) % bs;
|
|
||||||
const unsigned int r = lane % bs;
|
|
||||||
|
|
||||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
|
||||||
const unsigned int first_block = LUrows[target_block_row];
|
|
||||||
)";
|
|
||||||
if (full_matrix) {
|
|
||||||
s += "const unsigned int last_block = diagIndex[target_block_row]; ";
|
|
||||||
} else {
|
|
||||||
s += "const unsigned int last_block = LUrows[target_block_row+1]; ";
|
|
||||||
}
|
|
||||||
s += R"(
|
|
||||||
unsigned int block = first_block + lane / (bs*bs);
|
|
||||||
double local_out = 0.0;
|
|
||||||
if(lane < num_active_threads){
|
|
||||||
if(lane < bs){
|
|
||||||
local_out = y[target_block_row*bs+lane];
|
|
||||||
}
|
|
||||||
for(; block < last_block; block += num_blocks_per_warp){
|
|
||||||
const double x_elem = x[LUcols[block]*bs + c];
|
|
||||||
const double A_elem = LUvals[block*bs*bs + c + r*bs];
|
|
||||||
local_out -= x_elem * A_elem;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// do reduction in shared mem
|
|
||||||
tmp[lane] = local_out;
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
|
||||||
{
|
|
||||||
if (lane + offset < warpsize)
|
|
||||||
{
|
|
||||||
tmp[lane] += tmp[lane + offset];
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(lane < bs){
|
|
||||||
const unsigned int row = target_block_row*bs + lane;
|
|
||||||
x[row] = tmp[lane];
|
|
||||||
}
|
|
||||||
|
|
||||||
target_block_row += num_warps_in_grid;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
std::string OpenclKernels::get_ILU_apply2_source(bool full_matrix) {
|
|
||||||
std::string s = R"(
|
|
||||||
__kernel void ILU_apply2(
|
|
||||||
__global const double *LUvals,
|
|
||||||
__global const int *LUcols,
|
|
||||||
__global const int *LUrows,
|
|
||||||
__global const int *diagIndex,
|
|
||||||
__global const double *invDiagVals,
|
|
||||||
__global double *x,
|
|
||||||
__global const unsigned int *nodesPerColorPrefix,
|
|
||||||
const unsigned int color,
|
|
||||||
const unsigned int block_size,
|
|
||||||
__local double *tmp)
|
|
||||||
{
|
|
||||||
const unsigned int warpsize = 32;
|
|
||||||
const unsigned int bs = block_size;
|
|
||||||
const unsigned int idx_t = get_local_id(0);
|
|
||||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
|
||||||
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
|
||||||
const unsigned int NUM_THREADS = get_global_size(0);
|
|
||||||
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
|
||||||
unsigned int idx_g = get_global_id(0);
|
|
||||||
unsigned int target_block_row = idx_g / warpsize;
|
|
||||||
target_block_row += nodesPerColorPrefix[color];
|
|
||||||
const unsigned int lane = idx_t % warpsize;
|
|
||||||
const unsigned int c = (lane / bs) % bs;
|
|
||||||
const unsigned int r = lane % bs;
|
|
||||||
|
|
||||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
|
||||||
)";
|
|
||||||
if (full_matrix) {
|
|
||||||
s += "const unsigned int first_block = diagIndex[target_block_row] + 1; ";
|
|
||||||
} else {
|
|
||||||
s += "const unsigned int first_block = LUrows[target_block_row]; ";
|
|
||||||
}
|
|
||||||
s += R"(
|
|
||||||
const unsigned int last_block = LUrows[target_block_row+1];
|
|
||||||
unsigned int block = first_block + lane / (bs*bs);
|
|
||||||
double local_out = 0.0;
|
|
||||||
if(lane < num_active_threads){
|
|
||||||
if(lane < bs){
|
|
||||||
const unsigned int row = target_block_row*bs+lane;
|
|
||||||
local_out = x[row];
|
|
||||||
}
|
|
||||||
for(; block < last_block; block += num_blocks_per_warp){
|
|
||||||
const double x_elem = x[LUcols[block]*bs + c];
|
|
||||||
const double A_elem = LUvals[block*bs*bs + c + r*bs];
|
|
||||||
local_out -= x_elem * A_elem;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// do reduction in shared mem
|
|
||||||
tmp[lane] = local_out;
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
|
||||||
{
|
|
||||||
if (lane + offset < warpsize)
|
|
||||||
{
|
|
||||||
tmp[lane] += tmp[lane + offset];
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
local_out = tmp[lane];
|
|
||||||
|
|
||||||
if(lane < bs){
|
|
||||||
tmp[lane + bs*idx_t/warpsize] = local_out;
|
|
||||||
double sum = 0.0;
|
|
||||||
for(int i = 0; i < bs; ++i){
|
|
||||||
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
|
|
||||||
}
|
|
||||||
|
|
||||||
const unsigned int row = target_block_row*bs + lane;
|
|
||||||
x[row] = sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
target_block_row += num_warps_in_grid;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string OpenclKernels::get_stdwell_apply_source(bool reorder) {
|
|
||||||
std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
|
|
||||||
std::string s = "__kernel void " + kernel_name + R"((
|
|
||||||
__global const double *Cnnzs,
|
|
||||||
__global const double *Dnnzs,
|
|
||||||
__global const double *Bnnzs,
|
|
||||||
__global const int *Ccols,
|
|
||||||
__global const int *Bcols,
|
|
||||||
__global const double *x,
|
|
||||||
__global double *y,
|
|
||||||
)";
|
|
||||||
if (reorder) {
|
|
||||||
s += R"(__global const int *toOrder,
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
s += R"(const unsigned int dim,
|
|
||||||
const unsigned int dim_wells,
|
|
||||||
__global const unsigned int *val_pointers,
|
|
||||||
__local double *localSum,
|
|
||||||
__local double *z1,
|
|
||||||
__local double *z2){
|
|
||||||
int wgId = get_group_id(0);
|
|
||||||
int wiId = get_local_id(0);
|
|
||||||
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
|
|
||||||
int valsPerBlock = dim*dim_wells;
|
|
||||||
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
|
|
||||||
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
|
|
||||||
int c = wiId % dim;
|
|
||||||
int r = (wiId/dim) % dim_wells;
|
|
||||||
double temp;
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
localSum[wiId] = 0;
|
|
||||||
if(wiId < numActiveWorkItems){
|
|
||||||
int b = wiId/valsPerBlock + val_pointers[wgId];
|
|
||||||
while(b < valSize + val_pointers[wgId]){
|
|
||||||
)";
|
|
||||||
if (reorder) {
|
|
||||||
s += "int colIdx = toOrder[Bcols[b]]; ";
|
|
||||||
} else {
|
|
||||||
s += "int colIdx = Bcols[b]; ";
|
|
||||||
}
|
|
||||||
s += R"(
|
|
||||||
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
|
|
||||||
b += numBlocksPerWarp;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(wiId < valsPerBlock){
|
|
||||||
localSum[wiId] += localSum[wiId + valsPerBlock];
|
|
||||||
}
|
|
||||||
|
|
||||||
b = wiId/valsPerBlock + val_pointers[wgId];
|
|
||||||
|
|
||||||
if(c == 0 && wiId < valsPerBlock){
|
|
||||||
for(unsigned int stride = 2; stride > 0; stride >>= 1){
|
|
||||||
localSum[wiId] += localSum[wiId + stride];
|
|
||||||
}
|
|
||||||
z1[r] = localSum[wiId];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
if(wiId < dim_wells){
|
|
||||||
temp = 0.0;
|
|
||||||
for(unsigned int i = 0; i < dim_wells; ++i){
|
|
||||||
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
|
|
||||||
}
|
|
||||||
z2[wiId] = temp;
|
|
||||||
}
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
if(wiId < dim*valSize){
|
|
||||||
temp = 0.0;
|
|
||||||
int bb = wiId/dim + val_pointers[wgId];
|
|
||||||
for (unsigned int j = 0; j < dim_wells; ++j){
|
|
||||||
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
if (reorder) {
|
|
||||||
s += "int colIdx = toOrder[Ccols[bb]]; ";
|
|
||||||
} else {
|
|
||||||
s += "int colIdx = Ccols[bb]; ";
|
|
||||||
}
|
|
||||||
s += R"(
|
|
||||||
y[colIdx*dim + c] -= temp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
std::string OpenclKernels::get_ilu_decomp_source() {
|
|
||||||
return R"(
|
|
||||||
|
|
||||||
// a = a - (b * c)
|
|
||||||
__kernel void block_mult_sub(__global double *a, __local double *b, __global double *c)
|
|
||||||
{
|
|
||||||
const unsigned int block_size = 3;
|
|
||||||
const unsigned int hwarp_size = 16;
|
|
||||||
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
|
||||||
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
|
|
||||||
if(thread_id_in_hwarp < block_size * block_size){
|
|
||||||
const unsigned int row = thread_id_in_hwarp / block_size;
|
|
||||||
const unsigned int col = thread_id_in_hwarp % block_size;
|
|
||||||
double temp = 0.0;
|
|
||||||
for (unsigned int k = 0; k < block_size; k++) {
|
|
||||||
temp += b[block_size * row + k] * c[block_size * k + col];
|
|
||||||
}
|
|
||||||
a[block_size * row + col] -= temp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// c = a * b
|
|
||||||
__kernel void block_mult(__global double *a, __global double *b, __local double *c) {
|
|
||||||
const unsigned int block_size = 3;
|
|
||||||
const unsigned int hwarp_size = 16;
|
|
||||||
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
|
||||||
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
|
|
||||||
if(thread_id_in_hwarp < block_size * block_size){
|
|
||||||
const unsigned int row = thread_id_in_hwarp / block_size;
|
|
||||||
const unsigned int col = thread_id_in_hwarp % block_size;
|
|
||||||
double temp = 0.0;
|
|
||||||
for (unsigned int k = 0; k < block_size; k++) {
|
|
||||||
temp += a[block_size * row + k] * b[block_size * k + col];
|
|
||||||
}
|
|
||||||
c[block_size * row + col] = temp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// invert 3x3 matrix
|
|
||||||
__kernel void inverter(__global double *matrix, __global double *inverse) {
|
|
||||||
const unsigned int block_size = 3;
|
|
||||||
const unsigned int bs = block_size; // rename to shorter name
|
|
||||||
const unsigned int hwarp_size = 16;
|
|
||||||
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
|
||||||
const unsigned int thread_id_in_hwarp = idx_t % hwarp_size; // thread id in warp (16 threads)
|
|
||||||
if(thread_id_in_hwarp < bs * bs){
|
|
||||||
double t4 = matrix[0] * matrix[4];
|
|
||||||
double t6 = matrix[0] * matrix[5];
|
|
||||||
double t8 = matrix[1] * matrix[3];
|
|
||||||
double t10 = matrix[2] * matrix[3];
|
|
||||||
double t12 = matrix[1] * matrix[6];
|
|
||||||
double t14 = matrix[2] * matrix[6];
|
|
||||||
|
|
||||||
double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
|
|
||||||
t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
|
|
||||||
double t17 = 1.0 / det;
|
|
||||||
|
|
||||||
const unsigned int r = thread_id_in_hwarp / bs;
|
|
||||||
const unsigned int c = thread_id_in_hwarp % bs;
|
|
||||||
const unsigned int r1 = (r+1) % bs;
|
|
||||||
const unsigned int c1 = (c+1) % bs;
|
|
||||||
const unsigned int r2 = (r+bs-1) % bs;
|
|
||||||
const unsigned int c2 = (c+bs-1) % bs;
|
|
||||||
inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__kernel void ilu_decomp(const unsigned int firstRow,
|
|
||||||
const unsigned int lastRow,
|
|
||||||
__global double *LUvals,
|
|
||||||
__global const int *LUcols,
|
|
||||||
__global const int *LUrows,
|
|
||||||
__global double *invDiagVals,
|
|
||||||
__global int *diagIndex,
|
|
||||||
const unsigned int Nb,
|
|
||||||
__local double *pivot){
|
|
||||||
|
|
||||||
const unsigned int bs = 3;
|
|
||||||
const unsigned int hwarp_size = 16;
|
|
||||||
const unsigned int work_group_size = get_local_size(0);
|
|
||||||
const unsigned int work_group_id = get_group_id(0);
|
|
||||||
const unsigned int num_groups = get_num_groups(0);
|
|
||||||
const unsigned int hwarps_per_group = work_group_size / hwarp_size;
|
|
||||||
const unsigned int thread_id_in_group = get_local_id(0); // thread id in work group
|
|
||||||
const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size; // thread id in hwarp (16 threads)
|
|
||||||
const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size;
|
|
||||||
const unsigned int lmem_offset = hwarp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it
|
|
||||||
// every workitem in a hwarp has the same lmem_offset
|
|
||||||
|
|
||||||
// go through all rows
|
|
||||||
for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
|
|
||||||
{
|
|
||||||
int iRowStart = LUrows[i];
|
|
||||||
int iRowEnd = LUrows[i + 1];
|
|
||||||
|
|
||||||
// go through all elements of the row
|
|
||||||
for (int ij = iRowStart; ij < iRowEnd; ij++) {
|
|
||||||
int j = LUcols[ij];
|
|
||||||
|
|
||||||
if (j < i) {
|
|
||||||
// calculate the pivot of this row
|
|
||||||
block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
|
|
||||||
|
|
||||||
// copy pivot
|
|
||||||
if (thread_id_in_hwarp < bs * bs) {
|
|
||||||
LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp];
|
|
||||||
}
|
|
||||||
|
|
||||||
int jRowEnd = LUrows[j + 1];
|
|
||||||
int jk = diagIndex[j] + 1;
|
|
||||||
int ik = ij + 1;
|
|
||||||
// subtract that row scaled by the pivot from this row.
|
|
||||||
while (ik < iRowEnd && jk < jRowEnd) {
|
|
||||||
if (LUcols[ik] == LUcols[jk]) {
|
|
||||||
block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs);
|
|
||||||
ik++;
|
|
||||||
jk++;
|
|
||||||
} else {
|
|
||||||
if (LUcols[ik] < LUcols[jk])
|
|
||||||
{ ik++; }
|
|
||||||
else
|
|
||||||
{ jk++; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// store the inverse in the diagonal
|
|
||||||
inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
|
|
||||||
|
|
||||||
// copy inverse
|
|
||||||
if (thread_id_in_hwarp < bs * bs) {
|
|
||||||
LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace Accelerator
|
} // namespace Accelerator
|
||||||
} // namespace Opm
|
} // namespace Opm
|
||||||
|
|
||||||
|
@ -61,11 +61,6 @@ private:
|
|||||||
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
|
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
|
||||||
static bool initialized;
|
static bool initialized;
|
||||||
|
|
||||||
enum matrix_operation {
|
|
||||||
spmv_op,
|
|
||||||
residual_op
|
|
||||||
};
|
|
||||||
|
|
||||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
|
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
|
||||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
|
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
|
||||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
|
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
|
||||||
@ -86,76 +81,34 @@ private:
|
|||||||
static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
|
static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
|
||||||
static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
|
static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
|
||||||
|
|
||||||
/// Generate string with axpy kernel
|
|
||||||
/// a = a + alpha * b
|
|
||||||
static std::string get_axpy_source();
|
|
||||||
|
|
||||||
/// Generate string with scale kernel
|
|
||||||
/// a = a * alpha
|
|
||||||
static std::string get_scale_source();
|
|
||||||
|
|
||||||
/// multiply vector with another vector and a scalar, element-wise
|
|
||||||
/// add result to a third vector
|
|
||||||
static std::string get_vmul_source();
|
|
||||||
|
|
||||||
/// returns partial sums, instead of the final dot product
|
|
||||||
/// partial sums are added on CPU
|
|
||||||
static std::string get_dot_1_source();
|
|
||||||
|
|
||||||
/// returns partial sums, instead of the final norm
|
|
||||||
/// the square root must be computed on CPU
|
|
||||||
static std::string get_norm_source();
|
|
||||||
|
|
||||||
/// Generate string with custom kernel
|
|
||||||
/// This kernel combines some ilubicgstab vector operations into 1
|
|
||||||
/// p = (p - omega * v) * beta + r
|
|
||||||
static std::string get_custom_source();
|
|
||||||
|
|
||||||
/// Transform blocked vector to scalar vector using pressure-weights
|
|
||||||
static std::string get_full_to_pressure_restriction_source();
|
|
||||||
|
|
||||||
/// Add the coarse pressure solution back to the finer, complete solution
|
|
||||||
static std::string get_add_coarse_pressure_correction_source();
|
|
||||||
|
|
||||||
/// Prolongate a vector during the AMG cycle
|
|
||||||
static std::string get_prolongate_vector_source();
|
|
||||||
|
|
||||||
/// b = mat * x
|
|
||||||
/// algorithm based on:
|
|
||||||
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
|
||||||
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
|
||||||
/// or
|
|
||||||
/// res = rhs - (mat * x)
|
|
||||||
static std::string get_blocked_matrix_operation_source(matrix_operation op);
|
|
||||||
static std::string get_matrix_operation_source(matrix_operation op, bool spmv_reset = true);
|
|
||||||
|
|
||||||
/// ILU apply part 1: forward substitution
|
|
||||||
/// solves L*x=y where L is a lower triangular sparse blocked matrix
|
|
||||||
/// this L can be it's own BSR matrix (if full_matrix is false),
|
|
||||||
/// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end
|
|
||||||
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
|
|
||||||
static std::string get_ILU_apply1_source(bool full_matrix);
|
|
||||||
|
|
||||||
/// ILU apply part 2: backward substitution
|
|
||||||
/// solves U*x=y where U is an upper triangular sparse blocked matrix
|
|
||||||
/// this U can be it's own BSR matrix (if full_matrix is false),
|
|
||||||
/// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start
|
|
||||||
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
|
|
||||||
static std::string get_ILU_apply2_source(bool full_matrix);
|
|
||||||
|
|
||||||
/// Generate string with the stdwell_apply kernels
|
|
||||||
/// If reorder is true, the B/Ccols do not correspond with the x/y vector
|
|
||||||
/// the x/y vector is reordered, use toOrder to address that
|
|
||||||
/// \param[in] reorder whether the matrix is reordered or not
|
|
||||||
static std::string get_stdwell_apply_source(bool reorder);
|
|
||||||
|
|
||||||
/// Generate string with the exact ilu decomposition kernel
|
|
||||||
/// The kernel takes a full BSR matrix and performs inplace ILU decomposition
|
|
||||||
static std::string get_ilu_decomp_source();
|
|
||||||
|
|
||||||
OpenclKernels(){}; // disable instantiation
|
OpenclKernels(){}; // disable instantiation
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
static const std::string axpy_str;
|
||||||
|
static const std::string scale_str;
|
||||||
|
static const std::string vmul_str;
|
||||||
|
static const std::string dot_1_str;
|
||||||
|
static const std::string norm_str;
|
||||||
|
static const std::string custom_str;
|
||||||
|
static const std::string full_to_pressure_restriction_str;
|
||||||
|
static const std::string add_coarse_pressure_correction_str;
|
||||||
|
static const std::string prolongate_vector_str;
|
||||||
|
static const std::string spmv_blocked_str;
|
||||||
|
static const std::string spmv_str;
|
||||||
|
static const std::string spmv_noreset_str;
|
||||||
|
static const std::string residual_blocked_str;
|
||||||
|
static const std::string residual_str;
|
||||||
|
#if CHOW_PATEL
|
||||||
|
static const std::string ILU_apply1_str;
|
||||||
|
static const std::string ILU_apply2_str;
|
||||||
|
#else
|
||||||
|
static const std::string ILU_apply1_fm_str;
|
||||||
|
static const std::string ILU_apply2_fm_str;
|
||||||
|
#endif
|
||||||
|
static const std::string stdwell_apply_str;
|
||||||
|
static const std::string stdwell_apply_no_reorder_str;
|
||||||
|
static const std::string ILU_decomp_str;
|
||||||
|
|
||||||
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
|
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
|
||||||
|
|
||||||
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
||||||
|
Loading…
Reference in New Issue
Block a user