opm-simulators/opm/simulators/linalg/bda/opencl/kernels/isaiU.cl
2022-01-17 13:57:29 -03:00

85 lines
3.2 KiB
Common Lisp

__kernel void block_add(__global double *mat1, __global double *mat2, __global double *result)
{
const unsigned int bs = 3;
const unsigned int warpsize = 32;
const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
const unsigned int idx_t = get_local_id(0);
const unsigned int lane = idx_t % warpsize;
if(lane < num_active_threads){
const unsigned int row = lane % bs;
const unsigned int col = (lane / bs) % bs;
result[bs * row + col] = mat1[bs * row + col] + mat2[bs * row + col];
}
}
__kernel void block_mult_isai(__global double *mat1, __global double *mat2, __global double *result)
{
const unsigned int bs = 3;
const unsigned int warpsize = 32;
const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
const unsigned int idx_t = get_local_id(0);
const unsigned int lane = idx_t % warpsize;
if(lane < num_active_threads){
const unsigned int row = lane % bs;
const unsigned int col = (lane / bs) % bs;
double temp = 0.0;
for (unsigned int k = 0; k < bs; k++) {
temp += mat1[bs * row + k] * mat2[bs * k + col];
}
result[bs * row + col] = temp;
}
}
__kernel void isaiU(__global const int *diagIndex,
__global const int *colPtr,
__global const int *rowIndices,
__global const int *mapping,
__global const int *nvc,
__global const int *luIdxs,
__global const int *xxIdxs,
__global const int *dxIdxs,
__global const double *LU,
__global const double *invDiagVals,
__global double *invU,
const unsigned int Nb)
{
const unsigned int warpsize = 32;
const unsigned int idx_b = get_group_id(0);
const unsigned int idx_t = get_local_id(0);
const unsigned int idx = get_global_id(0);
const unsigned int bs = 3;
const unsigned int num_threads = get_global_size(0);
const unsigned int num_warps_in_grid = num_threads / warpsize;
const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
const unsigned int num_blocks_per_warp = warpsize / bs / bs;
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
unsigned int tcol = idx / warpsize;
while(tcol < Nb){
const unsigned int frow = colPtr[tcol];
const unsigned int lrow = diagIndex[tcol];
const unsigned int nx = lrow - frow + 1;
if(lane < num_active_threads){
block_add(invU + lrow * bs * bs, invDiagVals + tcol * bs * bs, invU + lrow * bs * bs);
for(unsigned int v = nvc[tcol]; v < nvc[tcol + 1]; v++){
block_mult_sub_isai(invU + xxIdxs[v] * bs * bs, LU + luIdxs[v] * bs * bs, invU + dxIdxs[v] * bs * bs);
}
for(unsigned int xid = 1; xid < nx; xid++){
unsigned int xpos = mapping[lrow - xid];
block_mult_isai(invDiagVals + rowIndices[lrow - xid] * bs * bs, invU + xpos * bs * bs, invU + xpos * bs * bs);
}
}
tcol += num_warps_in_grid;
}
}