/*
  Copyright 2020 Equinor ASA

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef OPENCL_HPP
#define OPENCL_HPP

namespace bda
{

inline const char* axpy_s = R"(
__kernel void axpy(
    __global double *in,
    const double a,
    __global double *out,
    const int N)
{
    unsigned int NUM_THREADS = get_global_size(0);
    int idx = get_global_id(0);

    while(idx < N){
        out[idx] += a * in[idx];
        idx += NUM_THREADS;
    }
}
)";


// returns partial sums, instead of the final dot product
inline const char* dot_1_s = R"(
__kernel void dot_1(
    __global double *in1,
    __global double *in2,
    __global double *out,
    const unsigned int N,
    __local double *tmp)
{
    unsigned int tid = get_local_id(0);
    unsigned int bsize = get_local_size(0);
    unsigned int bid = get_global_id(0) / bsize;
    unsigned int i = get_global_id(0);
    unsigned int NUM_THREADS = get_global_size(0);

    double sum = 0.0;
    while(i < N){
        sum += in1[i] * in2[i];
        i += NUM_THREADS;
    }
    tmp[tid] = sum;

    barrier(CLK_LOCAL_MEM_FENCE);

    // do reduction in shared mem
    for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            tmp[tid] += tmp[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // write result for this block to global mem
    if (tid == 0) out[get_group_id(0)] = tmp[0];
}
)";


// returns partial sums, instead of the final norm
// the square root must be computed on CPU
inline const char* norm_s = R"(
__kernel void norm(
    __global double *in,
    __global double *out,
    const unsigned int N,
    __local double *tmp)
{
    unsigned int tid = get_local_id(0);
    unsigned int bsize = get_local_size(0);
    unsigned int bid = get_global_id(0) / bsize;
    unsigned int i = get_global_id(0);
    unsigned int NUM_THREADS = get_global_size(0);

    double local_sum = 0.0;
    while(i < N){
        local_sum += in[i] * in[i];
        i += NUM_THREADS;
    }
    tmp[tid] = local_sum;

    barrier(CLK_LOCAL_MEM_FENCE);

    // do reduction in shared mem
    for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            tmp[tid] += tmp[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // write result for this block to global mem
    if (tid == 0) out[get_group_id(0)] = tmp[0];
}
)";


// p = (p - omega * v) * beta + r
inline const char* custom_s = R"(
__kernel void custom(
    __global double *p,
    __global double *v,
    __global double *r,
    const double omega,
    const double beta,
    const int N)
{
    const unsigned int NUM_THREADS = get_global_size(0);
    unsigned int idx = get_global_id(0);

    while(idx < N){
        double res = p[idx];
        res -= omega * v[idx];
        res *= beta;
        res += r[idx];
        p[idx] = res;
        idx += NUM_THREADS;
    }
}
)";
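
// Hedged editorial example, not part of the upstream kernels: dot_1 and norm
// above only write one partial result per work-group, and norm additionally
// leaves out the square root, so the host has to finish the reduction after
// copying the partial sums back. A minimal finalization over such a host-side
// buffer (the helper name and signature are illustrative assumptions) could
// look like the sketch below; for norm_s the caller still applies sqrt() to
// the returned value.
inline double sum_partial_results(const double *partials, unsigned int num_work_groups)
{
    double result = 0.0;
    for (unsigned int i = 0; i < num_work_groups; ++i) {
        result += partials[i];   // one entry per work-group, written by lane 0
    }
    return result;
}
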
// b = mat * x
// algorithm based on:
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
inline const char* spmv_blocked_s = R"(
__kernel void spmv_blocked(
    __global const double *vals,
    __global const int *cols,
    __global const int *rows,
    const int Nb,
    __global const double *x,
    __global double *b,
    const unsigned int block_size,
    __local double *tmp)
{
    const unsigned int warpsize = 32;
    const unsigned int bsize = get_local_size(0);
    const unsigned int idx_b = get_global_id(0) / bsize;
    const unsigned int idx_t = get_local_id(0);
    unsigned int idx = idx_b * bsize + idx_t;
    const unsigned int bs = block_size;
    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
    const unsigned int NUM_THREADS = get_global_size(0);
    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
    unsigned int target_block_row = idx / warpsize;
    const unsigned int lane = idx_t % warpsize;
    const unsigned int c = (lane / bs) % bs;
    const unsigned int r = lane % bs;

    // for 3x3 blocks:
    // num_active_threads: 27
    // num_blocks_per_warp: 3

    while(target_block_row < Nb){
        unsigned int first_block = rows[target_block_row];
        unsigned int last_block = rows[target_block_row+1];
        unsigned int block = first_block + lane / (bs*bs);
        double local_out = 0.0;

        if(lane < num_active_threads){
            for(; block < last_block; block += num_blocks_per_warp){
                double x_elem = x[cols[block]*bs + c];
                double A_elem = vals[block*bs*bs + c + r*bs];
                local_out += x_elem * A_elem;
            }
        }

        // do reduction in shared mem
        tmp[lane] = local_out;
        barrier(CLK_LOCAL_MEM_FENCE);

        for(unsigned int offset = 3; offset <= 24; offset <<= 1)
        {
            if (lane + offset < warpsize)
            {
                tmp[lane] += tmp[lane + offset];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        if(lane < bs){
            unsigned int row = target_block_row*bs + lane;
            b[row] = tmp[lane];
        }

        target_block_row += num_warps_in_grid;
    }
}
)";


// ILU apply part 1: forward substitution
// solves L*x=y where L is a lower triangular sparse blocked matrix
inline const char* ILU_apply1_s = R"(
__kernel void ILU_apply1(
    __global const double *Lvals,
    __global const unsigned int *Lcols,
    __global const unsigned int *Lrows,
    const unsigned int LrowSize,
    __global const double *y,
    __global double *x,
    __global const unsigned int *nodesPerColorPrefix,
    const unsigned int color,
    const unsigned int block_size,
    __local double *tmp)
{
    const unsigned int warpsize = 32;
    const unsigned int bs = block_size;
    const unsigned int idx_t = get_local_id(0);
    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
    const unsigned int NUM_THREADS = get_global_size(0);
    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
    unsigned int idx = get_global_id(0);
    unsigned int target_block_row = idx / warpsize;
    idx += nodesPerColorPrefix[color];
    target_block_row += nodesPerColorPrefix[color];
    const unsigned int lane = idx_t % warpsize;
    const unsigned int c = (lane / bs) % bs;
    const unsigned int r = lane % bs;

    while(target_block_row < nodesPerColorPrefix[color+1]){
        const unsigned int first_block = Lrows[target_block_row];
        const unsigned int last_block = Lrows[target_block_row+1];
        unsigned int block = first_block + lane / (bs*bs);
        double local_out = 0.0;

        if(lane < num_active_threads){
            if(lane < bs){
                local_out = y[target_block_row*bs+lane];
            }
            for(; block < last_block; block += num_blocks_per_warp){
                const double x_elem = x[Lcols[block]*bs + c];
                const double A_elem = Lvals[block*bs*bs + c + r*bs];
                local_out -= x_elem * A_elem;
            }
        }

        // do reduction in shared mem
        tmp[lane] = local_out;
        barrier(CLK_LOCAL_MEM_FENCE);

        for(unsigned int offset = 3; offset <= 24; offset <<= 1)
        {
            if (lane + offset < warpsize)
            {
                tmp[lane] += tmp[lane + offset];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        if(lane < bs){
            const unsigned int row = target_block_row*bs + lane;
            x[row] = tmp[lane];
        }

        target_block_row += num_warps_in_grid;
    }
}
)";
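
// Hedged editorial sketch, not upstream code: ILU_apply1 above and ILU_apply2
// below are level-scheduled, so a single launch only processes the rows of one
// color, bounded by nodesPerColorPrefix[color] and nodesPerColorPrefix[color+1].
// The host is therefore expected to launch the kernel once per color, in order,
// making sure one color has finished before the next one starts. With
// enqueue_color standing in for the actual kernel launch plus synchronization
// (a hypothetical callable), the driving loop is essentially:
template <class EnqueueColor>
void apply_all_colors(EnqueueColor enqueue_color, unsigned int numColors)
{
    for (unsigned int color = 0; color < numColors; ++color) {
        enqueue_color(color);   // one dependency level per launch
    }
}
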
// ILU apply part 2: backward substitution
// solves U*x=y where U is an upper triangular sparse blocked matrix
inline const char* ILU_apply2_s = R"(
__kernel void ILU_apply2(
    __global const double *Uvals,
    __global const int *Ucols,
    __global const int *Urows,
    const unsigned int UrowSize,
    __global const double *invDiagVals,
    __global double *x,
    __global const unsigned int *nodesPerColorPrefix,
    const unsigned int color,
    const unsigned int block_size,
    __local double *tmp)
{
    const unsigned int warpsize = 32;
    const unsigned int bs = block_size;
    const unsigned int idx_t = get_local_id(0);
    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
    const unsigned int NUM_THREADS = get_global_size(0);
    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
    unsigned int idx = get_global_id(0);
    unsigned int target_block_row = idx / warpsize;
    idx += nodesPerColorPrefix[color];
    target_block_row += nodesPerColorPrefix[color];
    const unsigned int lane = idx_t % warpsize;
    const unsigned int c = (lane / bs) % bs;
    const unsigned int r = lane % bs;
    const double relaxation = 0.9;

    while(target_block_row < nodesPerColorPrefix[color+1]){
        const unsigned int first_block = Urows[UrowSize-target_block_row-1];
        const unsigned int last_block = Urows[UrowSize-target_block_row];
        unsigned int block = first_block + lane / (bs*bs);
        double local_out = 0.0;

        if(lane < num_active_threads){
            if(lane < bs){
                const unsigned int row = target_block_row*bs+lane;
                local_out = x[row];
            }
            for(; block < last_block; block += num_blocks_per_warp){
                const double x_elem = x[Ucols[block]*bs + c];
                const double A_elem = Uvals[block*bs*bs + c + r*bs];
                local_out -= x_elem * A_elem;
            }
        }

        // do reduction in shared mem
        tmp[lane] = local_out;
        barrier(CLK_LOCAL_MEM_FENCE);

        for(unsigned int offset = 3; offset <= 24; offset <<= 1)
        {
            if (lane + offset < warpsize)
            {
                tmp[lane] += tmp[lane + offset];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        local_out = tmp[lane];

        if(lane < bs){
            tmp[lane + bs*idx_t/warpsize] = local_out;
            double sum = 0.0;
            for(int i = 0; i < bs; ++i){
                sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
            }

            const unsigned int row = target_block_row*bs + lane;
            x[row] = relaxation * sum;
        }

        target_block_row += num_warps_in_grid;
    }
}
)";
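
// Hedged editorial example, not part of the upstream solver: spmv_blocked,
// ILU_apply1 and ILU_apply2 above all read the matrix in the same BCSR layout,
// with rows[]/cols[] counted in blocks and every block_size x block_size block
// stored row-major in vals[], so vals[block*bs*bs + r*bs + c] is element (r,c).
// A plain CPU version of b = mat * x over that layout (the name and signature
// are illustrative only), e.g. for checking kernel output, could look like:
inline void spmv_blocked_reference(const double *vals, const int *cols, const int *rows,
                                   int Nb, const double *x, double *b, unsigned int bs)
{
    for (int block_row = 0; block_row < Nb; ++block_row) {
        for (unsigned int r = 0; r < bs; ++r) {
            double sum = 0.0;
            // loop over all blocks of this block row
            for (int block = rows[block_row]; block < rows[block_row + 1]; ++block) {
                for (unsigned int c = 0; c < bs; ++c) {
                    sum += vals[block * bs * bs + r * bs + c] * x[cols[block] * bs + c];
                }
            }
            b[block_row * bs + r] = sum;
        }
    }
}
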
inline const char* stdwell_apply_s = R"(
__kernel void stdwell_apply(__global const double *Cnnzs,
                            __global const double *Dnnzs,
                            __global const double *Bnnzs,
                            __global const int *Ccols,
                            __global const int *Bcols,
                            __global const double *x,
                            __global double *y,
                            __global const int *toOrder,
                            const unsigned int dim,
                            const unsigned int dim_wells,
                            __global const unsigned int *val_pointers,
                            __local double *localSum,
                            __local double *z1,
                            __local double *z2){
    int wgId = get_group_id(0);
    int wiId = get_local_id(0);
    int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
    int valsPerBlock = dim*dim_wells;
    int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
    int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
    int c = wiId % dim;
    int r = (wiId/dim) % dim_wells;
    double temp;

    barrier(CLK_LOCAL_MEM_FENCE);

    localSum[wiId] = 0;
    if(wiId < numActiveWorkItems){
        int b = wiId/valsPerBlock + val_pointers[wgId];
        while(b < valSize + val_pointers[wgId]){
            int colIdx = toOrder[Bcols[b]];
            localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
            b += numBlocksPerWarp;
        }

        if(wiId < valsPerBlock){
            localSum[wiId] += localSum[wiId + valsPerBlock];
        }

        b = wiId/valsPerBlock + val_pointers[wgId];
        if(wiId < valsPerBlock){
            if(c == 0 || c == 2) {localSum[wiId] += localSum[wiId + 2];}
            if(c == 0 || c == 1) {localSum[wiId] += localSum[wiId + 1];}
        }

        if(c == 0 && wiId < valsPerBlock){
            z1[r] = localSum[wiId];
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if(wiId < dim_wells){
        temp = 0.0;
        for(unsigned int i = 0; i < dim_wells; ++i){
            temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
        }
        z2[wiId] = temp;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if(wiId < dim*valSize){
        temp = 0.0;
        int bb = wiId/dim + val_pointers[wgId];
        for (unsigned int j = 0; j < dim_wells; ++j){
            temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
        }
        int colIdx = toOrder[Ccols[bb]];
        y[colIdx*dim + c] -= temp;
    }
}
)";

} // end namespace bda

#endif